diff --git a/.ci/docker/ci_commit_pins/pytorch.txt b/.ci/docker/ci_commit_pins/pytorch.txt index 3251f4ee9a7..0e9181ac55a 100644 --- a/.ci/docker/ci_commit_pins/pytorch.txt +++ b/.ci/docker/ci_commit_pins/pytorch.txt @@ -1 +1 @@ -c42ac54d9e817bf0a0366eb78e6c8beba4d5eff5 +aec9b2ab77389967ef39bb9c10662fd0fe3e185a diff --git a/.ci/docker/ci_commit_pins/torchao.txt b/.ci/docker/ci_commit_pins/torchao.txt new file mode 100644 index 00000000000..768110b82ff --- /dev/null +++ b/.ci/docker/ci_commit_pins/torchao.txt @@ -0,0 +1 @@ +0916b5b29b092afcbf2b898caae49abe80662bac diff --git a/.ci/docker/common/install_linter.sh b/.ci/docker/common/install_linter.sh index 4a796a72d54..d262176e49b 100755 --- a/.ci/docker/common/install_linter.sh +++ b/.ci/docker/common/install_linter.sh @@ -13,3 +13,7 @@ source "$(dirname "${BASH_SOURCE[0]}")/utils.sh" # NB: Install all linter dependencies, the caching of lintrunner init could be # done after Executorch becomes public pip_install -r requirements-lintrunner.txt + +# Install google-java-format +curl -L --retry 3 https://github.com/google/google-java-format/releases/download/v1.23.0/google-java-format_linux-x86-64 > /opt/google-java-format +chmod +x /opt/google-java-format diff --git a/.ci/scripts/build-qnn-sdk.sh b/.ci/scripts/build-qnn-sdk.sh index 260072f7342..deeaed34ac3 100644 --- a/.ci/scripts/build-qnn-sdk.sh +++ b/.ci/scripts/build-qnn-sdk.sh @@ -6,11 +6,12 @@ # LICENSE file in the root directory of this source tree. set -eux +set -o xtrace build_qnn_backend() { echo "Start building qnn backend." export ANDROID_NDK_ROOT=/opt/ndk - export QNN_SDK_ROOT=/tmp/qnn/2.23.0.240531 + export QNN_SDK_ROOT=/tmp/qnn/2.25.0.240728 export EXECUTORCH_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/../.." && pwd)" bash backends/qualcomm/scripts/build.sh --skip_aarch64 --job_number 2 --release @@ -26,8 +27,9 @@ set_up_aot() { -DCMAKE_INSTALL_PREFIX=$PWD \ -DEXECUTORCH_BUILD_QNN=ON \ -DQNN_SDK_ROOT=${QNN_SDK_ROOT} \ - -DEXECUTORCH_BUILD_SDK=ON \ + -DEXECUTORCH_BUILD_DEVTOOLS=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ -DPYTHON_EXECUTABLE=python3 \ -DEXECUTORCH_SEPARATE_FLATCC_HOST_PROJECT=OFF diff --git a/.ci/scripts/build_llama_android.sh b/.ci/scripts/build_llama_android.sh index 644fc4c2bb7..7d3370ee561 100644 --- a/.ci/scripts/build_llama_android.sh +++ b/.ci/scripts/build_llama_android.sh @@ -22,8 +22,9 @@ install_executorch_and_backend_lib() { -DANDROID_PLATFORM=android-23 \ -DCMAKE_INSTALL_PREFIX=cmake-android-out \ -DCMAKE_BUILD_TYPE=Release \ - -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_XNNPACK=ON \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ diff --git a/.ci/scripts/setup-ios.sh b/.ci/scripts/setup-ios.sh new file mode 100755 index 00000000000..519cd2581eb --- /dev/null +++ b/.ci/scripts/setup-ios.sh @@ -0,0 +1,33 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
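The QNN SDK root bumped to /tmp/qnn/2.25.0.240728 above is hard-coded in several scripts touched by this patch (build-qnn-sdk.sh, setup-qnn-deps.sh, test_llama.sh). A minimal sketch of a shared guard that fails fast when the expected layout is missing; the helper name is hypothetical and the function is not part of this patch:

# Sketch only, assuming the /tmp/qnn layout that setup-qnn-deps.sh unpacks.
assert_qnn_sdk_present() {
  local expected="/tmp/qnn/2.25.0.240728"
  if [[ ! -d "${expected}" ]]; then
    echo "QNN SDK not found at ${expected}; run .ci/scripts/setup-qnn-deps.sh first" >&2
    return 1
  fi
  export QNN_SDK_ROOT="${expected}"
}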
+ +set -exu + +# This script follows the instructions from GitHub to install an Apple certificate +# https://docs.github.com/en/actions/use-cases-and-examples/deploying/installing-an-apple-certificate-on-macos-runners-for-xcode-development + +CERTIFICATE_PATH="${RUNNER_TEMP}"/build_certificate.p12 +PP_PATH="${RUNNER_TEMP}"/build_pp.mobileprovision +KEYCHAIN_PATH="${RUNNER_TEMP}"/app-signing.keychain-db + +# Import certificate and provisioning profile from secrets +echo -n "$BUILD_CERTIFICATE_BASE64" | base64 --decode -o $CERTIFICATE_PATH +echo -n "$BUILD_PROVISION_PROFILE_BASE64" | base64 --decode -o $PP_PATH + +# Create a temporary keychain +security create-keychain -p "$KEYCHAIN_PASSWORD" $KEYCHAIN_PATH +security set-keychain-settings -lut 21600 $KEYCHAIN_PATH +security unlock-keychain -p "$KEYCHAIN_PASSWORD" $KEYCHAIN_PATH + +# Import certificate to the keychain +security import $CERTIFICATE_PATH -P "" -A -t cert -f pkcs12 -k $KEYCHAIN_PATH +security set-key-partition-list -S apple-tool:,apple: -k "$KEYCHAIN_PASSWORD" $KEYCHAIN_PATH +security list-keychain -d user -s $KEYCHAIN_PATH + +# Apply provisioning profile +mkdir -p ~/Library/MobileDevice/Provisioning\ Profiles +cp $PP_PATH ~/Library/MobileDevice/Provisioning\ Profiles diff --git a/.ci/scripts/setup-linux.sh b/.ci/scripts/setup-linux.sh index 4bccabad5cf..5df4668f65c 100755 --- a/.ci/scripts/setup-linux.sh +++ b/.ci/scripts/setup-linux.sh @@ -20,6 +20,5 @@ fi # As Linux job is running inside a Docker container, all of its dependencies # have already been installed -install_flatc_from_source install_executorch build_executorch_runner "${BUILD_TOOL}" diff --git a/.ci/scripts/setup-macos.sh b/.ci/scripts/setup-macos.sh index 2be7d9efe83..833ba0aafe6 100755 --- a/.ci/scripts/setup-macos.sh +++ b/.ci/scripts/setup-macos.sh @@ -128,7 +128,5 @@ if [[ -z "${GITHUB_RUNNER:-}" ]]; then fi print_cmake_info -install_pytorch_and_domains -install_flatc_from_source install_executorch build_executorch_runner "${BUILD_TOOL}" diff --git a/.ci/scripts/setup-qnn-deps.sh b/.ci/scripts/setup-qnn-deps.sh index 3b39e1aafe3..92ffd07bccc 100644 --- a/.ci/scripts/setup-qnn-deps.sh +++ b/.ci/scripts/setup-qnn-deps.sh @@ -7,14 +7,18 @@ set -ex +verify_pkg_installed() { + echo $(dpkg-query -W --showformat='${Status}\n' $1|grep "install ok installed") +} + install_qnn() { echo "Start installing qnn." QNN_INSTALLATION_DIR=/tmp/qnn mkdir -p "${QNN_INSTALLATION_DIR}" - curl -Lo /tmp/v2.23.0.24.06.24.zip "https://softwarecenter.qualcomm.com/api/download/software/qualcomm_neural_processing_sdk/v2.23.0.24.06.24.zip" + curl -Lo /tmp/v2.25.0.24.07.28.zip "https://softwarecenter.qualcomm.com/api/download/software/qualcomm_neural_processing_sdk/v2.25.0.240728.zip" echo "Finishing downloading qnn sdk." - unzip -qo /tmp/v2.23.0.24.06.24.zip -d /tmp + unzip -qo /tmp/v2.25.0.24.07.28.zip -d /tmp echo "Finishing unzip qnn sdk." @@ -26,4 +30,22 @@ install_qnn() { ls -lah "${QNN_INSTALLATION_DIR}" } +setup_libc++() { + sudo apt-get update + pkgs_to_check=('libc++-dev') + j=0 + while [ $j -lt ${#pkgs_to_check[*]} ]; do + install_status=$(verify_pkg_installed ${pkgs_to_check[$j]}) + if [ "$install_status" == "" ]; then + sudo apt-get install -y ${pkgs_to_check[$j]} + if [[ $? 
-ne 0 ]]; then + echo "ERROR: Failed to install required packages for libc++" + exit 1 + fi + fi + j=$(( $j +1)); + done +} + +setup_libc++ install_qnn diff --git a/.ci/scripts/test_llama.sh b/.ci/scripts/test_llama.sh index 4fa8c94905f..2e51866d902 100644 --- a/.ci/scripts/test_llama.sh +++ b/.ci/scripts/test_llama.sh @@ -11,7 +11,7 @@ source "$(dirname "${BASH_SOURCE[0]}")/utils.sh" MODEL_NAME=$1 # stories110M BUILD_TOOL=$2 # buck2 or cmake -DTYPE=$3 # fp16 or fp32 +DTYPE=$3 # fp16, bf16, or fp32 MODE=${4:-"xnnpack+custom"} # portable or xnnpack+custom or xnnpack+custom+qe UPLOAD_DIR=${5:-} if [[ $# -lt 4 ]]; then # Assuming 4 mandatory args @@ -29,7 +29,7 @@ if [[ -z "${BUILD_TOOL:-}" ]]; then fi if [[ -z "${DTYPE:-}" ]]; then - echo "Missing dtype, choose fp16 or fp32, exiting..." + echo "Missing dtype, choose fp16, bf16, or fp32, exiting..." exit 1 fi @@ -75,7 +75,7 @@ echo "COREML option ${COREML}" if [[ "${MODE}" =~ .*qnn.* ]]; then QNN=ON export EXECUTORCH_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/.." && pwd)" - export QNN_SDK_ROOT=/tmp/qnn/2.23.0.240531 + export QNN_SDK_ROOT=/tmp/qnn/2.25.0.240728 export LD_LIBRARY_PATH="${QNN_SDK_ROOT}/lib/x86_64-linux-clang" export PYTHONPATH=".." cp schema/program.fbs exir/_serialize/program.fbs @@ -107,8 +107,9 @@ cmake_install_executorch_libraries() { retry cmake \ -DCMAKE_INSTALL_PREFIX=cmake-out \ -DCMAKE_BUILD_TYPE=Debug \ - -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM="$CUSTOM" \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ @@ -173,6 +174,8 @@ fi EXPORTED_MODEL_NAME="llama2" if [[ "${DTYPE}" == "fp16" ]]; then EXPORTED_MODEL_NAME="${EXPORTED_MODEL_NAME}_h" +elif [[ "${DTYPE}" == "bf16" ]]; then + EXPORTED_MODEL_NAME="${EXPORTED_MODEL_NAME}_bf" elif [[ "${DTYPE}" == "fp32" ]]; then : else diff --git a/.ci/scripts/test_llava.sh b/.ci/scripts/test_llava.sh index 60589c96d47..8ac87b2302d 100644 --- a/.ci/scripts/test_llava.sh +++ b/.ci/scripts/test_llava.sh @@ -8,44 +8,99 @@ set -exu # shellcheck source=/dev/null +BUILD_TYPE=${1:-Debug} +TARGET_OS=${2:-Native} +BUILD_DIR=${3:-cmake-out} + +echo "Building with BUILD_TYPE: $BUILD_TYPE, TARGET_OS: $TARGET_OS, BUILD_DIR: $BUILD_DIR" + if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then - PYTHON_EXECUTABLE=python3 + PYTHON_EXECUTABLE=python3 fi +TARGET_OS_lower="$(echo "${TARGET_OS}" | awk '{print tolower($0)}')" +if [[ "${TARGET_OS_lower}" == "android" ]]; then + if [[ -z "${ANDROID_NDK}" ]]; then + echo "Set ANDROID_NDK environment variable to build for Android." 
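For reference, the dpkg-based check that setup-qnn-deps.sh adds above (verify_pkg_installed plus the setup_libc++ loop) can be condensed into one loop over package names. This is a sketch under the same apt/sudo assumptions, not part of the patch:

# Sketch only: install each package that dpkg does not report as installed.
ensure_apt_packages() {
  sudo apt-get update
  local pkg
  for pkg in "$@"; do
    if ! dpkg-query -W --showformat='${Status}\n' "${pkg}" 2>/dev/null | grep -q "install ok installed"; then
      sudo apt-get install -y "${pkg}" || { echo "ERROR: failed to install ${pkg}" >&2; return 1; }
    fi
  done
}
# Usage: ensure_apt_packages libc++-dev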
+ exit 1 + fi +fi + +# Number of processes for a parallel build +NPROC=8 +if hash nproc &> /dev/null; then NPROC=$(nproc); fi + +EXECUTORCH_COMMON_CMAKE_ARGS=" \ + -DCMAKE_INSTALL_PREFIX=${BUILD_DIR} \ + -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + -DEXECUTORCH_ENABLE_LOGGING=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ + -DEXECUTORCH_BUILD_XNNPACK=ON \ + -DEXECUTORCH_DO_NOT_USE_CXX11_ABI=ON \ + -DEXECUTORCH_XNNPACK_SHARED_WORKSPACE=ON" + cmake_install_executorch_libraries() { - cmake \ - -DCMAKE_INSTALL_PREFIX=cmake-out \ - -DCMAKE_BUILD_TYPE=Debug \ - -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ - -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ - -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ - -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ - -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ - -DEXECUTORCH_BUILD_XNNPACK=ON \ - -DEXECUTORCH_DO_NOT_USE_CXX11_ABI=ON \ - -DEXECUTORCH_XNNPACK_SHARED_WORKSPACE=ON \ - -Bcmake-out . - - - cmake --build cmake-out -j9 --target install --config Debug + cmake \ + ${EXECUTORCH_COMMON_CMAKE_ARGS} \ + -B${BUILD_DIR} . + + cmake --build ${BUILD_DIR} -j${NPROC} --target install --config ${BUILD_TYPE} } +cmake_install_executorch_libraries_for_android() { + cmake \ + -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ + -DANDROID_ABI=arm64-v8a \ + -DANDROID_PLATFORM=android-23 \ + ${EXECUTORCH_COMMON_CMAKE_ARGS} \ + -B${BUILD_DIR} . + + cmake --build ${BUILD_DIR} -j${NPROC} --target install --config ${BUILD_TYPE} +} + + +LLAVA_COMMON_CMAKE_ARGS=" \ + -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ + -DCMAKE_INSTALL_PREFIX=${BUILD_DIR} \ + -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_XNNPACK=ON" + cmake_build_llava_runner() { dir=examples/models/llava python_lib=$($PYTHON_EXECUTABLE -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())') - cmake \ - -DCMAKE_INSTALL_PREFIX=cmake-out \ - -DCMAKE_BUILD_TYPE=Debug \ - -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ - -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ - -DEXECUTORCH_BUILD_XNNPACK=ON \ - -DCMAKE_PREFIX_PATH="$python_lib" \ - -Bcmake-out/${dir} \ + cmake \ + ${LLAVA_COMMON_CMAKE_ARGS} \ + -DCMAKE_PREFIX_PATH="$python_lib" \ + -B${BUILD_DIR}/${dir} \ ${dir} + cmake --build ${BUILD_DIR}/${dir} -j${NPROC} --config ${BUILD_TYPE} +} + - cmake --build cmake-out/${dir} -j9 --config Debug +cmake_build_llava_runner_for_android() { + dir=examples/models/llava + python_lib=$($PYTHON_EXECUTABLE -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())') + + cmake \ + -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ + -DANDROID_ABI=arm64-v8a \ + -DANDROID_PLATFORM=android-23 \ + ${LLAVA_COMMON_CMAKE_ARGS} \ + -DCMAKE_PREFIX_PATH="$python_lib" \ + -DLLAVA_RUNNER_NO_TORCH_DUMMY_IMAGE=ON \ + -B${BUILD_DIR}/${dir} \ + ${dir} + + cmake --build ${BUILD_DIR}/${dir} -j${NPROC} --config ${BUILD_TYPE} } # only export the one without custom op for now since it's @@ -54,6 +109,13 @@ export_llava() { $PYTHON_EXECUTABLE -m executorch.examples.models.llava.export_llava --pte-name llava.pte --with-artifacts } +# Download a new image with different size, to test if the model can handle different image sizes +prepare_image_tensor() { + echo "Downloading image" + curl -o basketball.jpg 
https://upload.wikimedia.org/wikipedia/commons/7/73/Chicago_Bulls_and_New_Jersey_Nets%2C_March_28%2C_1991.jpg + $PYTHON_EXECUTABLE -m executorch.examples.models.llava.image_util --image-path basketball.jpg --output-path image.pt +} + run_and_verify() { NOW=$(date +"%H:%M:%S") echo "Starting to run llava runner at ${NOW}" @@ -69,17 +131,33 @@ run_and_verify() { echo "tokenizer.bin is missing." exit 1 fi - RUNTIME_ARGS="--model_path=llava.pte \ - --tokenizer_path=tokenizer.bin \ - --image_path=image.pt \ - --prompt=ASSISTANT: \ - --temperature=0 \ - --seq_len=650" - cmake-out/examples/models/llava/llava_main ${RUNTIME_ARGS} > result.txt + + + + RUNTIME_ARGS="--model_path=llava.pte \ + --tokenizer_path=tokenizer.bin \ + --image_path=image.pt \ + --prompt=ASSISTANT: \ + --temperature=0 \ + --seq_len=650" + + if [[ "${TARGET_OS_lower}" == "android" ]]; then + echo "Transfer relevant files to the phone via ADB and run llava_main with following args," + echo "$ llava_main ${RUNTIME_ARGS} " + exit 0; + fi + + ${BUILD_DIR}/examples/models/llava/llava_main ${RUNTIME_ARGS} > result.txt + # verify result.txt RESULT=$(cat result.txt) # set the expected prefix to be the same as prompt because there's a bug in sdpa_with_kv_cache that causes tokens. - EXPECTED_PREFIX="ASSISTANT:" + if [[ "$(uname)" == "Darwin" ]]; then + EXPECTED_PREFIX="ASSISTANT: image captures a basketball game in progress, with several players on the court. One of the players is dribbling the ball, while the others are in various" + else + # set the expected prefix to be the same as prompt because there's a bug in sdpa_with_kv_cache that causes tokens. + EXPECTED_PREFIX="ASSISTANT:" + fi if [[ "${RESULT}" == *"${EXPECTED_PREFIX}"* ]]; then echo "Expected result prefix: ${EXPECTED_PREFIX}" echo "Actual result: ${RESULT}" @@ -93,7 +171,20 @@ run_and_verify() { fi } -cmake_install_executorch_libraries -cmake_build_llava_runner +# Step1. Build stuff +if [[ "${TARGET_OS_lower}" == "android" ]]; then + cmake_install_executorch_libraries_for_android + cmake_build_llava_runner_for_android +elif [[ "${TARGET_OS_lower}" == "native" ]]; then + cmake_install_executorch_libraries + cmake_build_llava_runner +else + echo "Invalid TARGET_OS ($2): ${TARGET_OS}" +fi + +# Step2. Generate the PTE export_llava + +# Step3. Run +prepare_image_tensor run_and_verify diff --git a/.ci/scripts/test.sh b/.ci/scripts/test_model.sh similarity index 75% rename from .ci/scripts/test.sh rename to .ci/scripts/test_model.sh index 1f20042f02a..f558a508c93 100755 --- a/.ci/scripts/test.sh +++ b/.ci/scripts/test_model.sh @@ -50,13 +50,13 @@ prepare_artifacts_upload() { build_cmake_executor_runner() { echo "Building executor_runner" - (rm -rf ${CMAKE_OUTPUT_DIR} \ - && mkdir ${CMAKE_OUTPUT_DIR} \ - && cd ${CMAKE_OUTPUT_DIR} \ - && retry cmake -DCMAKE_BUILD_TYPE=Release \ - -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" ..) + rm -rf ${CMAKE_OUTPUT_DIR} + cmake -DCMAKE_BUILD_TYPE=Debug \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ + -B${CMAKE_OUTPUT_DIR} . - cmake --build ${CMAKE_OUTPUT_DIR} -j4 + cmake --build ${CMAKE_OUTPUT_DIR} -j4 --config Debug } run_portable_executor_runner() { @@ -64,9 +64,7 @@ run_portable_executor_runner() { if [[ "${BUILD_TOOL}" == "buck2" ]]; then buck2 run //examples/portable/executor_runner:executor_runner -- --model_path "./${MODEL_NAME}.pte" elif [[ "${BUILD_TOOL}" == "cmake" ]]; then - if [[ ! 
-f ${CMAKE_OUTPUT_DIR}/executor_runner ]]; then - build_cmake_executor_runner - fi + build_cmake_executor_runner ./${CMAKE_OUTPUT_DIR}/executor_runner --model_path "./${MODEL_NAME}.pte" else echo "Invalid build tool ${BUILD_TOOL}. Only buck2 and cmake are supported atm" @@ -156,9 +154,41 @@ test_model_with_qnn() { export PYTHONPATH=$EXECUTORCH_ROOT/.. if [[ "${MODEL_NAME}" == "dl3" ]]; then - "${PYTHON_EXECUTABLE}" -m examples.qualcomm.scripts.deeplab_v3 -b ${CMAKE_OUTPUT_DIR} -m SM8550 --compile_only --download - EXPORTED_MODEL=./deeplab_v3/dlv3_qnn.pte + EXPORT_SCRIPT=deeplab_v3 + EXPORTED_MODEL_NAME=dlv3_qnn.pte + elif [[ "${MODEL_NAME}" == "mv3" ]]; then + EXPORT_SCRIPT=mobilenet_v3 + EXPORTED_MODEL_NAME=mv3_qnn.pte + elif [[ "${MODEL_NAME}" == "mv2" ]]; then + EXPORT_SCRIPT=mobilenet_v2 + EXPORTED_MODEL_NAME=mv2_qnn.pte + elif [[ "${MODEL_NAME}" == "ic4" ]]; then + EXPORT_SCRIPT=inception_v4 + EXPORTED_MODEL_NAME=ic4_qnn.pte + elif [[ "${MODEL_NAME}" == "ic3" ]]; then + EXPORT_SCRIPT=inception_v3 + EXPORTED_MODEL_NAME=ic3_qnn.pte + elif [[ "${MODEL_NAME}" == "vit" ]]; then + EXPORT_SCRIPT=torchvision_vit + EXPORTED_MODEL_NAME=vit_qnn.pte fi + + # Use SM8450 for S22, SM8550 for S23, and SM8560 for S24 + # TODO(guangyang): Make QNN chipset matches the target device + QNN_CHIPSET=SM8450 + + "${PYTHON_EXECUTABLE}" -m examples.qualcomm.scripts.${EXPORT_SCRIPT} -b ${CMAKE_OUTPUT_DIR} -m ${QNN_CHIPSET} --compile_only + EXPORTED_MODEL=./${EXPORT_SCRIPT}/${EXPORTED_MODEL_NAME} +} + +test_model_with_coreml() { + if [[ "${BUILD_TOOL}" == "buck2" ]]; then + echo "coreml doesn't support buck2." + exit 1 + fi + + "${PYTHON_EXECUTABLE}" -m examples.apple.coreml.scripts.export --model_name="${MODEL_NAME}" + EXPORTED_MODEL=$(find "." -type f -name "${MODEL_NAME}*.pte" -print -quit) } if [[ "${BACKEND}" == "portable" ]]; then @@ -170,9 +200,21 @@ elif [[ "${BACKEND}" == "qnn" ]]; then if [[ $? -eq 0 ]]; then prepare_artifacts_upload fi +elif [[ "${BACKEND}" == "coreml" ]]; then + echo "Testing ${MODEL_NAME} with coreml..." + test_model_with_coreml + if [[ $? -eq 0 ]]; then + prepare_artifacts_upload + fi elif [[ "${BACKEND}" == "xnnpack" ]]; then echo "Testing ${MODEL_NAME} with xnnpack..." - test_model_with_xnnpack true true + WITH_QUANTIZATION=true + WITH_DELEGATION=true + if [[ "$MODEL_NAME" == "mobilebert" ]]; then + # TODO(T197452682) + WITH_QUANTIZATION=false + fi + test_model_with_xnnpack "${WITH_QUANTIZATION}" "${WITH_DELEGATION}" if [[ $? -eq 0 ]]; then prepare_artifacts_upload fi diff --git a/.ci/scripts/test_phi_3_mini.sh b/.ci/scripts/test_phi_3_mini.sh new file mode 100644 index 00000000000..40767013e23 --- /dev/null +++ b/.ci/scripts/test_phi_3_mini.sh @@ -0,0 +1,116 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
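The model-name mapping added to test_model_with_qnn in .ci/scripts/test_model.sh above grows by one elif per model. As a readability note only (the patch keeps the if/elif form), the same mapping as a case statement, with an explicit error branch added here for illustration:

case "${MODEL_NAME}" in
  dl3) EXPORT_SCRIPT=deeplab_v3;      EXPORTED_MODEL_NAME=dlv3_qnn.pte ;;
  mv3) EXPORT_SCRIPT=mobilenet_v3;    EXPORTED_MODEL_NAME=mv3_qnn.pte ;;
  mv2) EXPORT_SCRIPT=mobilenet_v2;    EXPORTED_MODEL_NAME=mv2_qnn.pte ;;
  ic4) EXPORT_SCRIPT=inception_v4;    EXPORTED_MODEL_NAME=ic4_qnn.pte ;;
  ic3) EXPORT_SCRIPT=inception_v3;    EXPORTED_MODEL_NAME=ic3_qnn.pte ;;
  vit) EXPORT_SCRIPT=torchvision_vit; EXPORTED_MODEL_NAME=vit_qnn.pte ;;
  *)   echo "Unsupported QNN model ${MODEL_NAME}" >&2; exit 1 ;;
esac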
+ +set -exu + +BUILD_TYPE=${1:-Debug} +BUILD_DIR=${3:-cmake-out} +MODEL_DIR=examples/models/phi-3-mini + +echo "Building with BUILD_TYPE: $BUILD_TYPE, BUILD_DIR: $BUILD_DIR" + +if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then + PYTHON_EXECUTABLE=python3 +fi + +# Number of processes for a parallel build +NPROC=8 +if hash nproc &> /dev/null; then NPROC=$(nproc); fi + +cmake_install_executorch_libraries() { + cmake -DPYTHON_EXECUTABLE=python \ + -DCMAKE_INSTALL_PREFIX=${BUILD_DIR} \ + -DEXECUTORCH_ENABLE_LOGGING=1 \ + -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ + -DEXECUTORCH_BUILD_XNNPACK=ON \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ + -B${BUILD_DIR} . + + cmake --build ${BUILD_DIR} -j${NPROC} --target install --config ${BUILD_TYPE} +} + +cmake_build_phi_3_mini() { + cmake -DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \ + -DCMAKE_INSTALL_PREFIX=${BUILD_DIR} \ + -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_XNNPACK=ON \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ + -B${BUILD_DIR}/${MODEL_DIR} \ + ${MODEL_DIR} + + cmake --build ${BUILD_DIR}/${MODEL_DIR} -j${NPROC} --config ${BUILD_TYPE} +} + +# Download and convert tokenizer.model +prepare_tokenizer() { + echo "Downloading and converting tokenizer.model" + wget -O tokenizer.model "https://huggingface.co/microsoft/Phi-3-mini-128k-instruct/resolve/main/tokenizer.model?download=true" + $PYTHON_EXECUTABLE -m executorch.extension.llm.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin +} + +# Export phi-3-mini model to pte +export_phi_3_mini () { + echo "Exporting phi-3-mini. This will take a few minutes" + $PYTHON_EXECUTABLE -m executorch.examples.models.phi-3-mini.export_phi-3-mini -c "4k" -s 128 -o phi-3-mini.pte +} + +run_and_verify() { + NOW=$(date +"%H:%M:%S") + echo "Starting to run phi-3-mini runner at ${NOW}" + if [[ ! -f "phi-3-mini.pte" ]]; then + echo "Export failed. Abort" + exit 1 + fi + if [[ ! -f "tokenizer.bin" ]]; then + echo "tokenizer.bin is missing." + exit 1 + fi + + ${BUILD_DIR}/${MODEL_DIR}/phi_3_mini_runner \ + --model_path=phi-3-mini.pte \ + --tokenizer_path=tokenizer.bin \ + --seq_len=128 \ + --temperature=0 \ + --prompt="<|system|> +You are a helpful assistant.<|end|> +<|user|> +What is the capital of France?<|end|> +<|assistant|>" > result.txt + + # verify result.txt + RESULT=$(cat result.txt) + EXPECTED_RESULT="The capital of France is Paris." + if [[ "${RESULT}" == *"${EXPECTED_RESULT}"* ]]; then + echo "Expected result prefix: ${EXPECTED_RESULT}" + echo "Actual result: ${RESULT}" + echo "Success" + exit 0 + else + echo "Expected result prefix: ${EXPECTED_RESULT}" + echo "Actual result: ${RESULT}" + echo "Failure; results not the same" + exit 1 + fi +} + +# Step 1. Build ExecuTorch and phi-3-mini runner +cmake_install_executorch_libraries +cmake_build_phi_3_mini + +# Step 2. Export the tokenizer and model +prepare_tokenizer +export_phi_3_mini + +# Step 3. 
Run and verify result +run_and_verify diff --git a/.ci/scripts/utils.sh b/.ci/scripts/utils.sh index ebc5361d00a..64c512cdccd 100644 --- a/.ci/scripts/utils.sh +++ b/.ci/scripts/utils.sh @@ -33,42 +33,6 @@ install_pip_dependencies() { popd || return } -install_domains() { - echo "Install torchvision and torchaudio" - pip install --no-use-pep517 --user "git+https://github.com/pytorch/audio.git@${TORCHAUDIO_VERSION}" - pip install --no-use-pep517 --user "git+https://github.com/pytorch/vision.git@${TORCHVISION_VERSION}" -} - -install_pytorch_and_domains() { - pushd .ci/docker || return - TORCH_VERSION=$(cat ci_commit_pins/pytorch.txt) - popd || return - - git clone https://github.com/pytorch/pytorch.git - - # Fetch the target commit - pushd pytorch || return - git checkout "${TORCH_VERSION}" - git submodule update --init --recursive - - export _GLIBCXX_USE_CXX11_ABI=0 - # Then build and install PyTorch - python setup.py bdist_wheel - pip install "$(echo dist/*.whl)" - - # Grab the pinned audio and vision commits from PyTorch - TORCHAUDIO_VERSION=$(cat .github/ci_commit_pins/audio.txt) - export TORCHAUDIO_VERSION - TORCHVISION_VERSION=$(cat .github/ci_commit_pins/vision.txt) - export TORCHVISION_VERSION - - install_domains - - popd || return - # Print sccache stats for debugging - sccache --show-stats || true -} - install_flatc_from_source() { # NB: This function could be used to install flatbuffer from source pushd third-party/flatbuffers || return diff --git a/.github/pytorch-probot.yml b/.github/pytorch-probot.yml index f684d83fa51..2b66829ed0a 100644 --- a/.github/pytorch-probot.yml +++ b/.github/pytorch-probot.yml @@ -1,5 +1,7 @@ # The schema is from https://github.com/pytorch/pytorch/blob/main/.github/pytorch-probot.yml ciflow_push_tags: +- ciflow/android +- ciflow/apple - ciflow/nightly - ciflow/trunk - ciflow/binaries diff --git a/.github/workflows/android-perf.yml b/.github/workflows/android-perf.yml index 78cd342c874..78c1a2dd096 100644 --- a/.github/workflows/android-perf.yml +++ b/.github/workflows/android-perf.yml @@ -15,7 +15,7 @@ on: description: Target devices to run benchmark required: false type: string - default: samsung_galaxy_s2x + default: samsung_galaxy_s22 delegates: description: Backend delegates required: false @@ -45,7 +45,7 @@ on: description: Target devices to run benchmark required: false type: string - default: samsung_galaxy_s2x + default: samsung_galaxy_s22 delegates: description: Backend delegates required: false @@ -84,9 +84,9 @@ jobs: # Separate default values from the workflow dispatch. To ensure defaults are accessible # during scheduled runs and to provide flexibility for different defaults between # on-demand and periodic benchmarking. 
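The set-parameters steps in these perf workflows turn comma-separated workflow inputs into JSON arrays with jq before writing them to GITHUB_OUTPUT, so they can feed the job matrices. A standalone illustration of that pattern with example values (use a throwaway file for GITHUB_OUTPUT when trying it outside Actions):

MODELS="stories110M,dl3,mv3"
echo "models=$(echo "$MODELS" | jq -Rc 'split(",")')" >> "$GITHUB_OUTPUT"
# writes: models=["stories110M","dl3","mv3"]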
- CRON_DEFAULT_MODELS: "stories110M" - CRON_DEFAULT_DEVICES: "samsung_galaxy_s2x" - CRON_DEFAULT_DELEGATES: "xnnpack" + CRON_DEFAULT_MODELS: "stories110M,dl3,mv3,mv2,ic4,ic3,vit" + CRON_DEFAULT_DEVICES: "samsung_galaxy_s22" + CRON_DEFAULT_DELEGATES: "xnnpack,qnn" run: | set -ex MODELS="${{ inputs.models }}" @@ -104,7 +104,8 @@ jobs: # Mapping devices to their corresponding device-pool-arn declare -A DEVICE_POOL_ARNS - DEVICE_POOL_ARNS[samsung_galaxy_s2x]="arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/e59f866a-30aa-4aa1-87b7-4510e5820dfa" + DEVICE_POOL_ARNS[samsung_galaxy_s22]="arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/e59f866a-30aa-4aa1-87b7-4510e5820dfa" + DEVICE_POOL_ARNS[samsung_galaxy_s24]="arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/98f8788c-2e25-4a3c-8bb2-0d1e8897c0db" # Resolve device names with their corresponding ARNs if [[ ! $(echo "$DEVICES" | jq empty 2>/dev/null) ]]; then @@ -139,6 +140,7 @@ jobs: submodules: 'true' timeout: 60 upload-artifact: android-models + upload-artifact-to-s3: true script: | # The generic Linux job chooses to use base env, not the one setup by the image echo "::group::Setting up dev environment" @@ -156,54 +158,28 @@ jobs: BUILD_MODE="cmake" DTYPE="fp32" - if [[ ${{ matrix.model }} == "stories*"" ]]; then + if [[ ${{ matrix.model }} =~ ^stories* ]]; then # Install requirements for export_llama PYTHON_EXECUTABLE=python bash examples/models/llama2/install_requirements.sh # Test llama2 if [[ ${{ matrix.delegate }} == "xnnpack" ]]; then DELEGATE_CONFIG="xnnpack+custom+qe" + elif [[ ${{ matrix.delegate }} == "qnn" ]]; then + DELEGATE_CONFIG="qnn" + else + echo "Unsupported delegate ${{ matrix.delegate }}" + exit 1 fi PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh "${{ matrix.model }}" "${BUILD_MODE}" "${DTYPE}" "${DELEGATE_CONFIG}" "${ARTIFACTS_DIR_NAME}" else - PYTHON_EXECUTABLE=python bash .ci/scripts/test.sh "${{ matrix.model }}" "${BUILD_MODE}" "${{ matrix.delegate }}" "${ARTIFACTS_DIR_NAME}" + PYTHON_EXECUTABLE=python bash .ci/scripts/test_model.sh "${{ matrix.model }}" "${BUILD_MODE}" "${{ matrix.delegate }}" "${ARTIFACTS_DIR_NAME}" fi echo "::endgroup::" - # Upload models to S3. 
The artifacts are needed not only by the device farm but also TorchChat - upload-models: - needs: export-models - runs-on: linux.2xlarge - steps: - - name: Download the models from GitHub - uses: actions/download-artifact@v3 - with: - # The name here needs to match the name of the upload-artifact parameter - name: android-models - path: ${{ runner.temp }}/artifacts/ - - - name: Verify the models - shell: bash - working-directory: ${{ runner.temp }}/artifacts/ - run: | - ls -lah ./ - - - name: Upload the models to S3 - uses: seemethere/upload-artifact-s3@v5 - with: - s3-bucket: gha-artifacts - s3-prefix: | - ${{ github.repository }}/${{ github.run_id }}/artifact - retention-days: 1 - if-no-files-found: ignore - path: ${{ runner.temp }}/artifacts/ - build-llm-demo: name: build-llm-demo uses: pytorch/test-infra/.github/workflows/linux_job.yml@main needs: set-parameters - strategy: - matrix: - tokenizer: [bpe] with: runner: linux.2xlarge docker-image: executorch-ubuntu-22.04-clang12-android @@ -211,6 +187,7 @@ jobs: ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} timeout: 90 upload-artifact: android-apps + upload-artifact-to-s3: true script: | set -eux @@ -220,37 +197,11 @@ jobs: PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh cmake export ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded - # TODO: This needs to be replaced with a generic loader .apk - # Build LLM Demo for Android - bash build/build_android_llm_demo.sh ${{ matrix.tokenizer }} ${ARTIFACTS_DIR_NAME} - - # Upload artifacts to S3. The artifacts are needed not only by the device farm but also TorchChat - upload-android-apps: - needs: build-llm-demo - runs-on: linux.2xlarge - steps: - - name: Download the apps from GitHub - uses: actions/download-artifact@v3 - with: - # The name here needs to match the name of the upload-artifact parameter - name: android-apps - path: ${{ runner.temp }}/artifacts/ - - - name: Verify the apps - shell: bash - working-directory: ${{ runner.temp }}/artifacts/ - run: | - ls -lah ./ + PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh + PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh - - name: Upload the apps to S3 - uses: seemethere/upload-artifact-s3@v5 - with: - s3-bucket: gha-artifacts - s3-prefix: | - ${{ github.repository }}/${{ github.run_id }}/artifact - retention-days: 14 - if-no-files-found: ignore - path: ${{ runner.temp }}/artifacts/ + export ANDROID_ABIS="arm64-v8a" + PYTHON_EXECUTABLE=python EXECUTORCH_BUILD_QNN=ON QNN_SDK_ROOT=/tmp/qnn/2.25.0.240728 bash build/build_android_llm_demo.sh ${ARTIFACTS_DIR_NAME} # Let's see how expensive this job is, we might want to tone it down by running it periodically benchmark-on-device: @@ -260,14 +211,17 @@ jobs: uses: pytorch/test-infra/.github/workflows/mobile_job.yml@main needs: - set-parameters - - upload-models - - upload-android-apps + - build-llm-demo + - export-models strategy: matrix: model: ${{ fromJson(needs.set-parameters.outputs.models) }} delegate: ${{ fromJson(needs.set-parameters.outputs.delegates) }} device: ${{ fromJson(needs.set-parameters.outputs.devices) }} + fail-fast: false with: + # Due to scheduling a job may be pushed beyond the default 60m threshold + timeout: 120 device-type: android runner: linux.2xlarge test-infra-ref: '' @@ -278,10 +232,9 @@ jobs: # Unlike models there are limited numbers of build flavor for apps, and the model controls whether it should build with bpe/tiktoken tokenizer. 
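A note on the condition change earlier in this workflow: the old test compared the model name against the quoted pattern "stories*" followed by a stray quote, so bash took the right-hand side literally and it never matched stories110M; the new =~ ^stories* form treats the right-hand side as a regular expression (the trailing * only means zero or more extra s characters, which is harmless). A quick illustration outside the workflow:

model="stories110M"
[[ "$model" == stories*   ]] && echo "unquoted == pattern: glob, matches"
[[ "$model" == "stories*" ]] || echo "quoted == pattern: literal, does not match"
[[ "$model" =~ ^stories   ]] && echo "=~ pattern: regex, matches"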
# It's okay to build all possible apps with all possible flavors in job "build-llm-demo". However, in this job, once a model is given, there is only # one app+flavor that could load and run the model. - # TODO: Hard code llm_demo_bpe for now in this job. - android-app-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/llm_demo_bpe/app-debug.apk - android-test-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/llm_demo_bpe/app-debug-androidTest.apk + android-app-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/minibench/app-debug.apk + android-test-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/minibench/app-debug-androidTest.apk # NB: Need to set the default spec here so that it works for periodic too test-spec: ${{ inputs.test_spec || 'https://ossci-android.s3.amazonaws.com/executorch/android-llm-device-farm-test-spec.yml' }} # Uploaded to S3 from the previous job - extra-data: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/${{ matrix.model }}_${{ matrix.delegate }}/model.zip + extra-data: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.delegate }}/model.zip diff --git a/.github/workflows/android.yml b/.github/workflows/android.yml index e33b6e78334..54e9dbb7619 100644 --- a/.github/workflows/android.yml +++ b/.github/workflows/android.yml @@ -5,6 +5,8 @@ on: branches: - main - release/* + tags: + - ciflow/android/* pull_request: paths: - .ci/docker/** @@ -24,9 +26,6 @@ jobs: build-llm-demo: name: build-llm-demo uses: pytorch/test-infra/.github/workflows/linux_job.yml@main - strategy: - matrix: - tokenizer: [bpe, tiktoken] with: runner: linux.2xlarge docker-image: executorch-ubuntu-22.04-clang12-android @@ -34,6 +33,7 @@ jobs: ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} timeout: 90 upload-artifact: android-apps + upload-artifact-to-s3: true script: | set -eux @@ -44,44 +44,13 @@ jobs: export ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded # Build LLM Demo for Android - bash build/build_android_llm_demo.sh ${{ matrix.tokenizer }} ${ARTIFACTS_DIR_NAME} - - # Upload artifacts to S3. The artifacts are needed not only by the device farm but also TorchChat - upload-artifacts: - needs: build-llm-demo - runs-on: linux.2xlarge - steps: - - name: Download the artifacts from GitHub - uses: actions/download-artifact@v3 - with: - # The name here needs to match the name of the upload-artifact parameter - name: android-apps - path: ${{ runner.temp }}/artifacts/ - - - name: Verify the artifacts - shell: bash - working-directory: ${{ runner.temp }}/artifacts/ - run: | - ls -lah ./ - - - name: Upload the artifacts to S3 - uses: seemethere/upload-artifact-s3@v5 - with: - s3-bucket: gha-artifacts - s3-prefix: | - ${{ github.repository }}/${{ github.run_id }}/artifact - # NOTE: Consume stale artifacts won't make sense for benchmarking as the goal is always to - # benchmark models as fresh as possible. I'm okay to keep the 14 retention-days for now - # for TorchChat until we have a periodic job can publish it more often. Ideally I want to - # reduce it to <= 2 day, meaning the benchmark job will run daily. 
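With upload-artifact-to-s3: true replacing the separate upload jobs, the device-farm jobs now address artifacts purely by convention. A sketch of the layout the URLs above assume, written with the shell equivalents of the github.repository and github.run_id expressions (illustrative only; MODEL and DELEGATE stand for the matrix values):

S3_BASE="https://gha-artifacts.s3.amazonaws.com/${GITHUB_REPOSITORY}/${GITHUB_RUN_ID}/artifacts"
APP_APK="${S3_BASE}/minibench/app-debug.apk"
TEST_APK="${S3_BASE}/minibench/app-debug-androidTest.apk"
MODEL_ZIP="${S3_BASE}/${MODEL}_${DELEGATE}/model.zip"   # e.g. stories110M_xnnpack/model.zip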
- retention-days: 14 - if-no-files-found: ignore - path: ${{ runner.temp }}/artifacts/ + bash build/build_android_llm_demo.sh ${ARTIFACTS_DIR_NAME} # Running Android emulator directly on the runner and not using Docker run-emulator: needs: build-llm-demo - runs-on: amz2023.linux.4xlarge + # NB: Use metal install for KVM support to run the emulator faster + runs-on: linux.24xl.spr-metal env: ANDROID_NDK_VERSION: r26c API_LEVEL: 34 @@ -129,9 +98,6 @@ jobs: uses: reactivecircus/android-emulator-runner@v2 with: api-level: ${{ env.API_LEVEL }} - # NB: x86_64 emulator is slow because the lack of KVM support on AWS, it - # seems that we can use metal instance for that but it hasn't been tried - # out yet. Also arm64-v8a arch requires an ARM runner arch: x86_64 script: ./build/run_android_emulator.sh # NB: This is to boot the emulator faster following the instructions on @@ -144,36 +110,3 @@ jobs: emulator-options: -no-snapshot-save -no-window -gpu swiftshader_indirect -noaudio -no-boot-anim -camera-back none # This is to make sure that the job doesn't fail flakily emulator-boot-timeout: 900 - - # Let's see how expensive this job is, we might want to tone it down by running it periodically - test-llama-app: - # Only PR from ExecuTorch itself has permission to access AWS, forked PRs will fail to - # authenticate with the cloud service - if: ${{ !github.event.pull_request.head.repo.fork }} - needs: upload-artifacts - permissions: - id-token: write - contents: read - uses: pytorch/test-infra/.github/workflows/mobile_job.yml@main - strategy: - matrix: - # https://github.com/pytorch/executorch/blob/main/examples/demo-apps/android/LlamaDemo/README.md#alternative-2-build-from-local-machine - # mentions that tiktoken is only for Llama3. So, we can export it later in another archive - # like https://ossci-assets.s3.amazonaws.com/executorch-android-llama2-7b-0717.zip when this is - # updated to run Llama3 - tokenizer: [bpe] - with: - device-type: android - runner: linux.2xlarge - test-infra-ref: '' - # This is the ARN of ExecuTorch project on AWS - project-arn: arn:aws:devicefarm:us-west-2:308535385114:project:02a2cf0f-6d9b-45ee-ba1a-a086587469e6 - # This is the custom Android device pool that only includes Samsung Galaxy S2x - device-pool-arn: arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/e59f866a-30aa-4aa1-87b7-4510e5820dfa - # Uploaded to S3 from the previous job, the name of the app comes from the project itself - android-app-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/llm_demo_${{ matrix.tokenizer }}/app-debug.apk - android-test-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/llm_demo_${{ matrix.tokenizer }}/app-debug-androidTest.apk - test-spec: https://ossci-android.s3.amazonaws.com/executorch/android-llm-device-farm-test-spec.yml - # Among the input, this is the biggest file, so it is cached on AWS to make the test faster. Note that the file is deleted by AWS after 30 - # days and the job will automatically re-upload the file when that happens. 
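The run-emulator job above moves to a metal runner specifically so the emulator can use KVM. A quick sanity check one could run on the runner to confirm hardware acceleration is actually available; this is an illustrative sketch, not part of the patch:

if [[ -r /dev/kvm && -w /dev/kvm ]]; then
  echo "KVM available: the x86_64 emulator can boot with hardware acceleration"
else
  echo "No usable /dev/kvm: the emulator falls back to much slower software emulation" >&2
fi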
- extra-data: https://ossci-assets.s3.amazonaws.com/executorch-android-llama2-7b-0717.zip diff --git a/.github/workflows/apple-perf.yml b/.github/workflows/apple-perf.yml new file mode 100644 index 00000000000..b4b1d3aef58 --- /dev/null +++ b/.github/workflows/apple-perf.yml @@ -0,0 +1,308 @@ +name: apple-perf + +on: + schedule: + - cron: 0 1 * * * + # Note: GitHub has an upper limit of 10 inputs + workflow_dispatch: + inputs: + models: + description: Models to be benchmarked + required: false + type: string + default: stories110M + devices: + description: Target devices to run benchmark + required: false + type: string + default: apple_iphone_15 + delegates: + description: Backend delegates + required: false + type: string + default: xnnpack + benchmark_configs: + description: The list of configs used the benchmark + required: false + type: string + test_spec: + description: The test spec to drive the test on AWS devices + required: false + type: string + workflow_call: + inputs: + models: + description: Models to be benchmarked + required: false + type: string + default: stories110M + devices: + description: Target devices to run benchmark + required: false + type: string + default: apple_iphone_15 + delegates: + description: Backend delegates + required: false + type: string + default: xnnpack + benchmark_configs: + description: The list of configs used the benchmark + required: false + type: string + test_spec: + description: The test spec to drive the test on AWS devices + required: false + type: string + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + cancel-in-progress: true + +jobs: + set-parameters: + runs-on: linux.2xlarge + outputs: + models: ${{ steps.set-parameters.outputs.models }} + devices: ${{ steps.set-parameters.outputs.devices }} + delegates: ${{ steps.set-parameters.outputs.delegates }} + steps: + - name: Set parameters + id: set-parameters + shell: bash + env: + # Separate default values from the workflow dispatch. To ensure defaults are accessible + # during scheduled runs and to provide flexibility for different defaults between + # on-demand and periodic benchmarking. + CRON_DEFAULT_MODELS: "stories110M,mv3,ic4,resnet50,edsr,mobilebert,w2l" + CRON_DEFAULT_DEVICES: "apple_iphone_15" + CRON_DEFAULT_DELEGATES: "xnnpack,coreml" + run: | + set -ex + MODELS="${{ inputs.models }}" + if [ -z "$MODELS" ]; then + MODELS="$CRON_DEFAULT_MODELS" + fi + DEVICES="${{ inputs.devices }}" + if [ -z "$DEVICES" ]; then + DEVICES="$CRON_DEFAULT_DEVICES" + fi + DELEGATES="${{ inputs.delegates }}" + if [ -z "$DELEGATES" ]; then + DELEGATES="$CRON_DEFAULT_DELEGATES" + fi + + # Mapping devices to their corresponding device-pool-arn + declare -A DEVICE_POOL_ARNS + DEVICE_POOL_ARNS[apple_iphone_15]="arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/3b5acd2e-92e2-4778-b651-7726bafe129d" + + # Resolve device names with their corresponding ARNs + if [[ ! $(echo "$DEVICES" | jq empty 2>/dev/null) ]]; then + DEVICES=$(echo "$DEVICES" | jq -Rc 'split(",")') + fi + declare -a MAPPED_ARNS=() + for DEVICE in $(echo "$DEVICES" | jq -r '.[]'); do + if [[ -z "${DEVICE_POOL_ARNS[$DEVICE]}" ]]; then + echo "Error: No ARN found for device '$DEVICE'. Abort." 
>&2 + exit 1 + fi + MAPPED_ARNS+=("${DEVICE_POOL_ARNS[$DEVICE]}") + done + + echo "models=$(echo $MODELS | jq -Rc 'split(",")')" >> $GITHUB_OUTPUT + MAPPED_ARNS_JSON=$(printf '%s\n' "${MAPPED_ARNS[@]}" | jq -R . | jq -s .) + echo "devices=$(echo "$MAPPED_ARNS_JSON" | jq -c .)" >> $GITHUB_OUTPUT + echo "delegates=$(echo $DELEGATES | jq -Rc 'split(",")')" >> $GITHUB_OUTPUT + + export-models: + name: export-models + uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + needs: set-parameters + strategy: + matrix: + model: ${{ fromJson(needs.set-parameters.outputs.models) }} + delegate: ${{ fromJson(needs.set-parameters.outputs.delegates) }} + fail-fast: false + with: + # NB: Need to use our AWS MacOS runner to upload large models to S3 + runner: macos-m1-stable + python-version: '3.11' + submodules: 'true' + timeout: 60 + upload-artifact: ios-models + upload-artifact-to-s3: true + script: | + set -eux + + echo "::group::Setting up CI environment" + .ci/scripts/setup-conda.sh + + BUILD_TOOL=cmake + # Setup MacOS dependencies as there is no Docker support on MacOS atm + GITHUB_RUNNER=1 PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ + .ci/scripts/setup-macos.sh "${BUILD_TOOL}" + + if [[ ${{ matrix.delegate }} == "coreml" ]]; then + PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ + backends/apple/coreml/scripts/install_requirements.sh + fi + + if [[ ${{ matrix.delegate }} == "mps" ]]; then + PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ + backends/apple/mps/install_requirements.sh + fi + + ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded/${{ matrix.model }}_${{ matrix.delegate }} + echo "::endgroup::" + + echo "::group::Exporting ${{ matrix.delegate }} model: ${{ matrix.model }}" + BUILD_MODE="cmake" + DTYPE="fp32" + + if [[ ${{ matrix.model }} =~ ^stories* ]]; then + # Install requirements for export_llama + PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ + bash examples/models/llama2/install_requirements.sh + + # Test llama2 + if [[ ${{ matrix.delegate }} == "xnnpack" ]]; then + DELEGATE_CONFIG="xnnpack+custom+qe" + elif [[ ${{ matrix.delegate }} == "coreml" ]]; then + DELEGATE_CONFIG="coreml" + fi + PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ + bash .ci/scripts/test_llama.sh "${{ matrix.model }}" "${BUILD_MODE}" "${DTYPE}" "${DELEGATE_CONFIG}" "${ARTIFACTS_DIR_NAME}" + else + PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ + bash .ci/scripts/test_model.sh "${{ matrix.model }}" "${BUILD_MODE}" "${{ matrix.delegate }}" "${ARTIFACTS_DIR_NAME}" + fi + echo "::endgroup::" + + build-benchmark-app: + name: build-benchmark-app + uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + needs: + - set-parameters + secrets: inherit + with: + runner: macos-latest-xlarge + python-version: '3.11' + submodules: 'true' + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + upload-artifact: ios-apps + secrets-env: BUILD_CERTIFICATE_BASE64 EXECUTORCH_BENCHMARK_BUILD_PROVISION_PROFILE_BASE64 KEYCHAIN_PASSWORD + timeout: 90 + script: | + set -eux + + echo "::group::Setting up CI environment" + .ci/scripts/setup-conda.sh + + BUILD_TOOL=cmake + # Setup MacOS dependencies as there is no Docker support on MacOS atm + GITHUB_RUNNER=1 PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ + .ci/scripts/setup-macos.sh "${BUILD_TOOL}" + export ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded + + # Setup Apple certificate for iOS development + 
BUILD_PROVISION_PROFILE_BASE64="${SECRET_EXECUTORCH_BENCHMARK_BUILD_PROVISION_PROFILE_BASE64}" \ + BUILD_CERTIFICATE_BASE64="${SECRET_BUILD_CERTIFICATE_BASE64}" \ + KEYCHAIN_PASSWORD="${SECRET_KEYCHAIN_PASSWORD}" \ + .ci/scripts/setup-ios.sh + + # Install CoreML Backend Requirements + PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ + backends/apple/coreml/scripts/install_requirements.sh + + # Install MPS Backend Requirements + PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ + backends/apple/mps/install_requirements.sh + echo "::endgroup::" + + echo "::group::Build ExecuTorch iOS frameworks" + FRAMEWORKS=( + "executorch" + "backend_coreml" + "backend_mps" + "backend_xnnpack" + "kernels_custom" + "kernels_optimized" + "kernels_portable" + "kernels_quantized" + ) + + # Build Release iOS Frameworks + PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ + build/build_apple_frameworks.sh --coreml --custom --mps --optimized --portable --quantized --xnnpack + + mkdir -p extension/apple/Benchmark/Frameworks + for FRAMEWORK in "${FRAMEWORKS[@]}"; do ( + cp -r "cmake-out/${FRAMEWORK}.xcframework" extension/apple/Benchmark/Frameworks/ + ) done + echo "::endgroup::" + + # NB: Although exported models can be copied to this directory and bundled together with the + # app, we don't use this in CI and rely on AWS extra data parameter to make the model and the + # tokenizer available to the benchmark. This decouples the app and the model. We just need to + # create the directory here to pass the build + mkdir -p extension/apple/Benchmark/Models + ${CONDA_RUN} --no-capture-output \ + build/build_apple_llm_demo.sh ${ARTIFACTS_DIR_NAME} + + upload-benchmark-app: + needs: build-benchmark-app + runs-on: linux.2xlarge + steps: + - name: Download the apps from GitHub + uses: actions/download-artifact@v3 + with: + # The name here needs to match the name of the upload-artifact parameter + name: ios-apps + path: ${{ runner.temp }}/artifacts/ + + - name: Verify the apps + shell: bash + working-directory: ${{ runner.temp }}/artifacts/ + run: | + ls -lah ./ + + - name: Upload the apps to S3 + uses: seemethere/upload-artifact-s3@v5 + with: + s3-bucket: gha-artifacts + s3-prefix: | + ${{ github.repository }}/${{ github.run_id }}/artifacts + retention-days: 14 + if-no-files-found: ignore + path: ${{ runner.temp }}/artifacts/ + + benchmark-on-device: + needs: + - set-parameters + - upload-benchmark-app + - export-models + permissions: + id-token: write + contents: read + uses: pytorch/test-infra/.github/workflows/mobile_job.yml@main + strategy: + matrix: + model: ${{ fromJson(needs.set-parameters.outputs.models) }} + delegate: ${{ fromJson(needs.set-parameters.outputs.delegates) }} + device: ${{ fromJson(needs.set-parameters.outputs.devices) }} + fail-fast: false + with: + # Due to scheduling a job may be pushed beyond the default 60m threshold + timeout: 120 + device-type: ios + # For iOS testing, the runner just needs to call AWS Device Farm, so there is no need to run this on macOS + runner: linux.2xlarge + test-infra-ref: '' + # This is the ARN of ExecuTorch project on AWS + project-arn: arn:aws:devicefarm:us-west-2:308535385114:project:02a2cf0f-6d9b-45ee-ba1a-a086587469e6 + device-pool-arn: ${{ matrix.device }} + # Uploaded to S3 from the previous job + ios-ipa-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/Benchmark.ipa + ios-xctestrun-zip: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id 
}}/artifacts/Benchmark.xctestrun.zip + test-spec: ${{ inputs.test_spec || 'https://ossci-ios.s3.amazonaws.com/executorch/default-ios-device-farm-appium-test-spec.yml' }} + extra-data: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.delegate }}/model.zip diff --git a/.github/workflows/apple.yml b/.github/workflows/apple.yml index 60022b81f9e..229d8e5abf6 100644 --- a/.github/workflows/apple.yml +++ b/.github/workflows/apple.yml @@ -8,13 +8,14 @@ on: pull_request: paths: - .ci/docker/** + - .ci/scripts/setup-ios.sh - .github/workflows/apple.yml - install_requirements.sh - backends/apple/** - build/build_apple_frameworks.sh - build/create_frameworks.sh - build/test_ios_ci.sh - - examples/demo-apps/** + - examples/demo-apps/apple_ios/** - extension/apple/** - extension/module/** workflow_dispatch: @@ -24,27 +25,89 @@ concurrency: cancel-in-progress: true jobs: - test-demo-ios: - name: test-demo-ios + build-demo-ios: + name: build-demo-ios uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + secrets: inherit with: runner: macos-latest-xlarge python-version: '3.11' submodules: 'true' ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} timeout: 90 + secrets-env: BUILD_CERTIFICATE_BASE64 EXECUTORCH_DEMO_BUILD_PROVISION_PROFILE_BASE64 KEYCHAIN_PASSWORD + upload-artifact: ios-apps script: | BUILD_TOOL=cmake .ci/scripts/setup-conda.sh + # Setup Apple certificate for iOS development + BUILD_PROVISION_PROFILE_BASE64="${SECRET_EXECUTORCH_DEMO_BUILD_PROVISION_PROFILE_BASE64}" \ + BUILD_CERTIFICATE_BASE64="${SECRET_BUILD_CERTIFICATE_BASE64}" \ + KEYCHAIN_PASSWORD="${SECRET_KEYCHAIN_PASSWORD}" \ + .ci/scripts/setup-ios.sh + # Setup MacOS dependencies as there is no Docker support on MacOS atm GITHUB_RUNNER=1 PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ .ci/scripts/setup-macos.sh "${BUILD_TOOL}" + export ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded + # Build and test iOS Demo App PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ - build/test_ios_ci.sh + build/test_ios_ci.sh ${ARTIFACTS_DIR_NAME} + + # Upload the test demo app to S3 + upload-demo-ios: + needs: build-demo-ios + runs-on: linux.2xlarge + steps: + - name: Download the artifacts from GitHub + uses: actions/download-artifact@v3 + with: + # The name here needs to match the name of the upload-artifact parameter + name: ios-apps + path: ${{ runner.temp }}/artifacts/ + + - name: Verify the artifacts + shell: bash + working-directory: ${{ runner.temp }}/artifacts/ + run: | + ls -lah ./ + + - name: Upload the artifacts to S3 + uses: seemethere/upload-artifact-s3@v5 + with: + s3-bucket: gha-artifacts + s3-prefix: | + ${{ github.repository }}/${{ github.run_id }}/artifact + retention-days: 14 + if-no-files-found: ignore + path: ${{ runner.temp }}/artifacts/ + + test-demo-ios: + # Only PR from ExecuTorch itself has permission to access AWS, forked PRs will fail to + # authenticate with the cloud service. 
So, this job will be skipped on the latter + if: ${{ !github.event.pull_request.head.repo.fork }} + needs: upload-demo-ios + permissions: + id-token: write + contents: read + uses: pytorch/test-infra/.github/workflows/mobile_job.yml@main + with: + device-type: ios + # For iOS testing, the runner just needs to call AWS Device Farm, so there is no need to run this on macOS + runner: linux.2xlarge + test-infra-ref: '' + # This is the ARN of ExecuTorch project on AWS + project-arn: arn:aws:devicefarm:us-west-2:308535385114:project:02a2cf0f-6d9b-45ee-ba1a-a086587469e6 + # This is the custom device pool that only includes iOS devices + device-pool-arn: arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/3b5acd2e-92e2-4778-b651-7726bafe129d + # Uploaded to S3 from the previous job + ios-ipa-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/ExecuTorchDemo.ipa + ios-xctestrun-zip: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/ExecuTorchDemo.xctestrun.zip + test-spec: https://ossci-ios.s3.amazonaws.com/executorch/default-ios-device-farm-appium-test-spec.yml build-frameworks-ios: name: build-frameworks-ios diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 7cb2cf69b8b..56b70409d79 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -54,3 +54,25 @@ jobs: lint.json || true exit $RC + + android-java-format: + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + with: + runner: linux.2xlarge + docker-image: executorch-ubuntu-22.04-linter + fetch-depth: 0 + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + script: | + FILES_NEEDS_FORMAT=$(/opt/google-java-format -n extension/android/src/main/java/org/pytorch/executorch/*.java \ + examples/demo-apps/android/ExecuTorchDemo/app/src/main/java/com/example/executorchdemo/*.java \ + examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/*.java \ + extension/android/benchmark/app/src/main/java/org/pytorch/minibench/*.java) + if [ -n "$FILES_NEEDS_FORMAT" ]; then + echo "Warning: The following files need formatting. Please use google-java-format." + echo "Use a binary from https://github.com/google/google-java-format/releases/" + echo "For example:" + echo "wget https://github.com/google/google-java-format/releases/download/v1.23.0/google-java-format_linux-x86-64" + echo "chmod +x google-java-format_linux-x86-64" + echo "./google-java-format_linux-x86-64 -i $FILES_NEEDS_FORMAT" + exit 1 + fi diff --git a/.github/workflows/periodic.yml b/.github/workflows/periodic.yml index 4cc57b0c7f1..df13140ca92 100644 --- a/.github/workflows/periodic.yml +++ b/.github/workflows/periodic.yml @@ -62,4 +62,4 @@ jobs: PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}" # Build and test ExecuTorch - PYTHON_EXECUTABLE=python bash .ci/scripts/test.sh "${MODEL_NAME}" "${BUILD_TOOL}" "${BACKEND}" "${DEMO_BACKEND_DELEGATION}" + PYTHON_EXECUTABLE=python bash .ci/scripts/test_model.sh "${MODEL_NAME}" "${BUILD_TOOL}" "${BACKEND}" "${DEMO_BACKEND_DELEGATION}" diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 3e346c716e7..f7d2b627bc5 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -54,7 +54,7 @@ jobs: PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}" # Build and test ExecuTorch with the add model on portable backend. 
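The android-java-format lint job above runs google-java-format with -n (dry run: print the files that would change) and fails if the list is non-empty. To fix formatting locally, run the same binary with -i, as the job's hint suggests; condensed from the instructions the job prints:

wget https://github.com/google/google-java-format/releases/download/v1.23.0/google-java-format_linux-x86-64
chmod +x google-java-format_linux-x86-64
./google-java-format_linux-x86-64 -i \
  extension/android/src/main/java/org/pytorch/executorch/*.java \
  extension/android/benchmark/app/src/main/java/org/pytorch/minibench/*.java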
- PYTHON_EXECUTABLE=python bash .ci/scripts/test.sh "add" "${BUILD_TOOL}" "portable" + PYTHON_EXECUTABLE=python bash .ci/scripts/test_model.sh "add" "${BUILD_TOOL}" "portable" test-models-linux: name: test-models-linux @@ -81,7 +81,7 @@ jobs: PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}" # Build and test ExecuTorch - PYTHON_EXECUTABLE=python bash .ci/scripts/test.sh "${MODEL_NAME}" "${BUILD_TOOL}" "${BACKEND}" "${DEMO_BACKEND_DELEGATION}" + PYTHON_EXECUTABLE=python bash .ci/scripts/test_model.sh "${MODEL_NAME}" "${BUILD_TOOL}" "${BACKEND}" "${DEMO_BACKEND_DELEGATION}" test-llama-runner-linux: name: test-llama-runner-linux @@ -91,6 +91,13 @@ jobs: dtype: [fp32] build-tool: [buck2, cmake] mode: [portable, xnnpack+custom, xnnpack+custom+qe] + include: + - dtype: bf16 + build-tool: cmake + mode: portable + - dtype: bf16 + build-tool: buck2 + mode: portable fail-fast: false with: runner: linux.2xlarge @@ -407,3 +414,30 @@ jobs: PYTHON_EXECUTABLE=python bash examples/models/llama2/install_requirements.sh # Test llama2 PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh stories110M "${BUILD_TOOL}" "${DTYPE}" "${MODE}" + + test-phi-3-mini-runner-linux: + name: test-phi-3-mini-runner-linux + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + strategy: + fail-fast: false + with: + runner: linux.24xlarge + docker-image: executorch-ubuntu-22.04-clang12 + submodules: 'true' + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + timeout: 90 + script: | + # The generic Linux job chooses to use base env, not the one setup by the image + CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") + conda activate "${CONDA_ENV}" + + PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake" + + # install pybind + bash install_requirements.sh --pybind xnnpack + + # install phi-3-mini requirements + bash examples/models/phi-3-mini/install_requirements.sh + + # run e2e (export, tokenizer and runner) + PYTHON_EXECUTABLE=python bash .ci/scripts/test_phi_3_mini.sh diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 98d14824638..d7130561fa6 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -59,7 +59,7 @@ jobs: # Setup MacOS dependencies as there is no Docker support on MacOS atm PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh "${BUILD_TOOL}" # Build and test xecutorch - PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test.sh "${MODEL_NAME}" "${BUILD_TOOL}" "${BACKEND}" "${DEMO_BACKEND_DELEGATION}" + PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_model.sh "${MODEL_NAME}" "${BUILD_TOOL}" "${BACKEND}" "${DEMO_BACKEND_DELEGATION}" test-custom-ops-macos: name: test-custom-ops-macos @@ -143,7 +143,6 @@ jobs: conda activate "${CONDA_ENV}" source .ci/scripts/utils.sh - install_flatc_from_source install_executorch install_arm @@ -169,7 +168,6 @@ jobs: conda activate "${CONDA_ENV}" source .ci/scripts/utils.sh - install_flatc_from_source install_executorch install_arm @@ -225,8 +223,10 @@ jobs: strategy: matrix: dtype: [fp32] - build-tool: [buck2, cmake] mode: [portable, xnnpack+kv+custom, mps, coreml] + include: + - dtype: bf16 + mode: portable fail-fast: false with: runner: macos-m1-stable @@ -237,25 +237,12 @@ jobs: script: | DTYPE=${{ matrix.dtype }} - BUILD_TOOL=${{ matrix.build-tool }} MODE=${{ matrix.mode }} - if [[ "${BUILD_TOOL}" == "buck2" ]]; then - # TODO: Will add more modes that don't support buck2 - if [[ "${MODE}" == 
"mps" ]]; then - echo "mps doesn't support buck2." - exit 0 - fi - if [[ "${MODE}" == "coreml" ]]; then - echo "coreml doesn't support buck2." - exit 0 - fi - fi - bash .ci/scripts/setup-conda.sh # Setup executorch - PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh "${BUILD_TOOL}" + PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh cmake if [[ "${MODE}" == "mps" ]]; then # Install mps delegate @@ -270,7 +257,36 @@ jobs: # Install requirements for export_llama PYTHON_EXECUTABLE=python ${CONDA_RUN} bash examples/models/llama2/install_requirements.sh # Test llama2 - PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_llama.sh stories110M "${BUILD_TOOL}" "${DTYPE}" "${MODE}" + PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_llama.sh stories110M cmake "${DTYPE}" "${MODE}" + + # # TODO(jackzhxng): Runner consistently runs out of memory before test finishes. Try to find a more powerful runner. + # test-llava-runner-macos: + # name: test-llava-runner-macos + # uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + # strategy: + # fail-fast: false + # with: + # runner: macos-14-xlarge + # python-version: '3.11' + # submodules: 'true' + # ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # timeout: 900 + # script: | + # BUILD_TOOL=cmake + + # bash .ci/scripts/setup-conda.sh + # # Setup MacOS dependencies as there is no Docker support on MacOS atm + # GITHUB_RUNNER=1 PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh "${BUILD_TOOL}" + + # # install Llava requirements + # ${CONDA_RUN} bash examples/models/llama2/install_requirements.sh + # ${CONDA_RUN} bash examples/models/llava/install_requirements.sh + + # # run python unittest + # ${CONDA_RUN} python -m unittest examples.models.llava.test.test_llava + + # # run e2e (export, tokenizer and runner) + # PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_llava.sh Release test-qnn-model: name: test-qnn-model @@ -278,7 +294,7 @@ jobs: strategy: matrix: dtype: [fp32] - model: [dl3] + model: [dl3, mv3, mv2, ic4, ic3, vit] fail-fast: false with: runner: linux.2xlarge @@ -293,4 +309,128 @@ jobs: PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh cmake PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh - PYTHON_EXECUTABLE=python bash .ci/scripts/test.sh ${{ matrix.model }} "cmake" "qnn" + PYTHON_EXECUTABLE=python bash .ci/scripts/test_model.sh ${{ matrix.model }} "cmake" "qnn" + + test-coreml-model: + name: test-coreml-model + uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + strategy: + fail-fast: false + with: + runner: macos-m1-stable + python-version: '3.11' + submodules: 'true' + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + timeout: 90 + script: | + BUILD_TOOL=cmake + BACKEND=coreml + + bash .ci/scripts/setup-conda.sh + + # Setup MacOS dependencies as there is no Docker support on MacOS atm + PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh "${BUILD_TOOL}" + PYTHON_EXECUTABLE=python ${CONDA_RUN} bash backends/apple/coreml/scripts/install_requirements.sh + echo "Finishing installing coreml." 
+ + # Build and test coreml model + MODELS=(mv3 ic4 resnet50 edsr mobilebert w2l) + for MODEL_NAME in "${MODELS[@]}"; do + echo "::group::Exporting coreml model: $MODEL_NAME" + PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_model.sh "${MODEL_NAME}" "${BUILD_TOOL}" "${BACKEND}" + echo "::endgroup::" + done + + test-huggingface-transformers: + name: test-huggingface-transformers + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + secrets: inherit + strategy: + matrix: + hf_model_repo: [google/gemma-2b] + fail-fast: false + with: + secrets-env: EXECUTORCH_HF_TOKEN + runner: linux.12xlarge + docker-image: executorch-ubuntu-22.04-clang12 + submodules: 'true' + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + timeout: 90 + script: | + echo "::group::Set up ExecuTorch" + # The generic Linux job chooses to use base env, not the one setup by the image + CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") + conda activate "${CONDA_ENV}" + PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh cmake + + echo "Installing libexecutorch.a, libextension_module.so, libportable_ops_lib.a" + rm -rf cmake-out + cmake \ + -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ + -DEXECUTORCH_BUILD_XNNPACK=ON \ + -DPYTHON_EXECUTABLE=python \ + -Bcmake-out . + cmake --build cmake-out -j9 --target install --config Release + + echo "Build llama runner" + dir="examples/models/llama2" + cmake \ + -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ + -DEXECUTORCH_BUILD_XNNPACK=ON \ + -DPYTHON_EXECUTABLE=python \ + -Bcmake-out/${dir} \ + ${dir} + cmake --build cmake-out/${dir} -j9 --config Release + echo "::endgroup::" + + echo "::group::Set up HuggingFace Dependencies" + if [ -z "$SECRET_EXECUTORCH_HF_TOKEN" ]; then + echo "::error::SECRET_EXECUTORCH_HF_TOKEN is empty. For security reason secrets won't be accessible on forked PRs. Please make sure you submit a non-forked PR." 
+ exit 1 + fi + pip install -U "huggingface_hub[cli]" + huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN + pip install accelerate sentencepiece + # TODO(guangyang): Switch to use released transformers library after all required patches are included + pip install "git+https://github.com/huggingface/transformers.git@6cc4dfe3f1e8d421c6d6351388e06e9b123cbfe1" + pip list + echo "::endgroup::" + + echo "::group::Export to ExecuTorch" + TOKENIZER_FILE=tokenizer.model + TOKENIZER_BIN_FILE=tokenizer.bin + ET_MODEL_NAME=et_model + # Fetch the file using a Python one-liner + DOWNLOADED_TOKENIZER_FILE_PATH=$(python -c " + from huggingface_hub import hf_hub_download + # Download the file from the Hugging Face Hub + downloaded_path = hf_hub_download( + repo_id='${{ matrix.hf_model_repo }}', + filename='${TOKENIZER_FILE}' + ) + print(downloaded_path) + ") + if [ -f "$DOWNLOADED_TOKENIZER_FILE_PATH" ]; then + echo "${TOKENIZER_FILE} downloaded successfully at: $DOWNLOADED_TOKENIZER_FILE_PATH" + python -m extension.llm.tokenizer.tokenizer -t $DOWNLOADED_TOKENIZER_FILE_PATH -o ./${TOKENIZER_BIN_FILE} + ls ./tokenizer.bin + else + echo "Failed to download ${TOKENIZER_FILE} from ${{ matrix.hf_model_repo }}." + exit 1 + fi + + python -m extension.export_util.export_hf_model -hfm=${{ matrix.hf_model_repo }} -o ${ET_MODEL_NAME} + + cmake-out/examples/models/llama2/llama_main --model_path=${ET_MODEL_NAME}.pte --tokenizer_path=${TOKENIZER_BIN_FILE} --prompt="My name is" + echo "::endgroup::" diff --git a/.github/workflows/upload-test-specs.yml b/.github/workflows/upload-android-test-specs.yml similarity index 70% rename from .github/workflows/upload-test-specs.yml rename to .github/workflows/upload-android-test-specs.yml index 24119b64566..e9b1054080c 100644 --- a/.github/workflows/upload-test-specs.yml +++ b/.github/workflows/upload-android-test-specs.yml @@ -1,19 +1,21 @@ -name: Upload AWS Device Farm test specs +name: Upload AWS Device Farm Android test specs on: pull_request: paths: - - .github/workflows/upload-test-specs.yml - - examples/demo-apps/android/LlamaDemo/android-llm-device-farm-test-spec.yml + - .github/workflows/upload-android-test-specs.yml + - extension/android/benchmark/android-llm-device-farm-test-spec.yml push: branches: - main paths: - - .github/workflows/upload-test-specs.yml - - examples/demo-apps/android/LlamaDemo/android-llm-device-farm-test-spec.yml + - .github/workflows/upload-android-test-specs.yml + - extension/android/benchmark/android-llm-device-farm-test-spec.yml concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + # NB: This concurency group needs to be different than the one used in android-perf, otherwise + # GH complains about concurrency deadlock + group: android-spec-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} cancel-in-progress: true jobs: @@ -27,10 +29,10 @@ jobs: with: s3-bucket: gha-artifacts s3-prefix: | - ${{ github.repository }}/${{ github.run_id }}/artifact + ${{ github.repository }}/${{ github.run_id }}/artifacts retention-days: 1 if-no-files-found: error - path: examples/demo-apps/android/LlamaDemo/android-llm-device-farm-test-spec.yml + path: extension/android/benchmark/android-llm-device-farm-test-spec.yml 
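Note: the `s3-prefix` used by the upload step above and the `test_spec` URL consumed by the validation job below have to agree on the `artifacts` path segment of the gha-artifacts bucket. A small Python sketch of how that URL is composed (repository and run-id values are illustrative):

def gha_artifact_url(repository: str, run_id: str, filename: str) -> str:
    # Mirrors the gha-artifacts bucket layout: <repo>/<run_id>/artifacts/<file>
    return (
        "https://gha-artifacts.s3.amazonaws.com/"
        f"{repository}/{run_id}/artifacts/{filename}"
    )

# Illustrative values only:
print(gha_artifact_url("pytorch/executorch", "1234567890",
                       "android-llm-device-farm-test-spec.yml"))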
validate-android-test-spec: needs: upload-android-test-spec-for-validation @@ -41,9 +43,9 @@ jobs: with: # Just use a small model here with a minimal amount of configuration to test the spec models: stories110M - devices: samsung_galaxy_s2x + devices: samsung_galaxy_s22 delegates: xnnpack - test_spec: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/android-llm-device-farm-test-spec.yml + test_spec: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/android-llm-device-farm-test-spec.yml upload-android-test-spec: needs: validate-android-test-spec @@ -75,7 +77,7 @@ jobs: - name: Upload the spec to S3 ossci-android bucket shell: bash - working-directory: examples/demo-apps/android/LlamaDemo/ + working-directory: extension/android/benchmark/ env: SPEC_FILE: android-llm-device-farm-test-spec.yml run: | diff --git a/.github/workflows/upload-apple-test-specs.yml b/.github/workflows/upload-apple-test-specs.yml new file mode 100644 index 00000000000..06d20ef2beb --- /dev/null +++ b/.github/workflows/upload-apple-test-specs.yml @@ -0,0 +1,95 @@ +name: Upload AWS Device Farm Apple iOS test specs + +on: + pull_request: + paths: + - .github/workflows/upload-apple-test-specs.yml + - examples/demo-apps/apple_ios/default-ios-device-farm-appium-test-spec.yml + push: + branches: + - main + paths: + - .github/workflows/upload-apple-test-specs.yml + - examples/demo-apps/apple_ios/default-ios-device-farm-appium-test-spec.yml + +concurrency: + # NB: This concurency group needs to be different than the one used in apple-perf, otherwise + # GH complains about concurrency deadlock + group: apple-spec-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + cancel-in-progress: true + +jobs: + upload-apple-test-spec-for-validation: + runs-on: linux.2xlarge + steps: + - uses: actions/checkout@v3 + + - name: Upload the spec as a GitHub artifact for validation + uses: seemethere/upload-artifact-s3@v5 + with: + s3-bucket: gha-artifacts + s3-prefix: | + ${{ github.repository }}/${{ github.run_id }}/artifacts + retention-days: 1 + if-no-files-found: error + path: examples/demo-apps/apple_ios/default-ios-device-farm-appium-test-spec.yml + + validate-apple-test-spec: + needs: upload-apple-test-spec-for-validation + uses: ./.github/workflows/apple-perf.yml + secrets: inherit + permissions: + id-token: write + contents: read + with: + # Just use a small model here with a minimal amount of configuration to test the spec + models: stories110M + devices: apple_iphone_15 + delegates: xnnpack + test_spec: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/default-ios-device-farm-appium-test-spec.yml + + upload-apple-test-spec: + needs: validate-apple-test-spec + runs-on: ubuntu-22.04 + timeout-minutes: 15 + permissions: + id-token: write + contents: read + steps: + - uses: actions/checkout@v3 + + - uses: actions/setup-python@v4 + with: + python-version: '3.11' + cache: pip + + - name: configure aws credentials + uses: aws-actions/configure-aws-credentials@v1.7.0 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_executorch_upload-frameworks-ios + aws-region: us-east-1 + + - name: Only push to S3 when running the workflow manually from main branch + if: ${{ github.ref == 'refs/heads/main' }} + shell: bash + run: | + set -eux + echo "UPLOAD_ON_MAIN=1" >> 
"${GITHUB_ENV}" + + - name: Upload the spec to S3 ossci-ios bucket + shell: bash + working-directory: examples/demo-apps/apple_ios + env: + SPEC_FILE: default-ios-device-farm-appium-test-spec.yml + run: | + set -eux + + pip install awscli==1.32.18 + + AWS_CMD="aws s3 cp --dryrun" + if [[ "${UPLOAD_ON_MAIN:-0}" == "1" ]]; then + AWS_CMD="aws s3 cp" + fi + + shasum -a 256 "${SPEC_FILE}" + ${AWS_CMD} "${SPEC_FILE}" s3://ossci-ios/executorch/ --acl public-read diff --git a/.gitmodules b/.gitmodules index 0999bdb9356..71ff854bb03 100644 --- a/.gitmodules +++ b/.gitmodules @@ -21,7 +21,7 @@ url = https://github.com/Maratyszcza/FXdiv.git [submodule "backends/xnnpack/third-party/XNNPACK"] path = backends/xnnpack/third-party/XNNPACK - url = https://github.com/digantdesai/XNNPACK.git + url = https://github.com/google/XNNPACK.git [submodule "backends/xnnpack/third-party/cpuinfo"] path = backends/xnnpack/third-party/cpuinfo url = https://github.com/pytorch/cpuinfo.git diff --git a/.lintrunner.toml b/.lintrunner.toml index c28512c5986..7aa15d65638 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -74,6 +74,9 @@ exclude_patterns = [ # NB: Objective-C is not supported 'examples/apple/**', 'examples/demo-apps/apple_ios/**', + # File contains @generated + 'extension/llm/custom_ops/spinquant/fast_hadamard_transform_special.h', + 'extension/llm/custom_ops/spinquant/test/fast_hadamard_transform_special_unstrided_cpu.h', ] command = [ 'python', @@ -177,6 +180,9 @@ exclude_patterns = [ '**/*.bat', '**/*.jpg', '**/*.jar', + # File contains @generated + 'extension/llm/custom_ops/spinquant/fast_hadamard_transform_special.h', + 'extension/llm/custom_ops/spinquant/test/fast_hadamard_transform_special_unstrided_cpu.h', ] command = [ 'python', diff --git a/CMakeLists.txt b/CMakeLists.txt index afb0437fae4..288bc9018ad 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -181,6 +181,10 @@ option(EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL "Build the Runner Util extension" OFF ) +option(EXECUTORCH_BUILD_EXTENSION_TENSOR "Build the Tensor extension" OFF) + +option(EXECUTORCH_BUILD_EXTENSION_TRAINING "Build the training extension" OFF) + option(EXECUTORCH_BUILD_GTESTS "Build googletest based test binaries" OFF) option(EXECUTORCH_BUILD_MPS "Build the MPS backend" OFF) @@ -195,7 +199,7 @@ option(EXECUTORCH_BUILD_KERNELS_OPTIMIZED "Build the optimized kernels" OFF) option(EXECUTORCH_BUILD_KERNELS_QUANTIZED "Build the quantized kernels" OFF) -option(EXECUTORCH_BUILD_SDK "Build the ExecuTorch SDK") +option(EXECUTORCH_BUILD_DEVTOOLS "Build the ExecuTorch Developer Tools") option(EXECUTORCH_BUILD_SIZE_TEST "Build the size test" OFF) @@ -226,6 +230,7 @@ cmake_dependent_option( ) if(EXECUTORCH_BUILD_KERNELS_CUSTOM_AOT) + set(EXECUTORCH_BUILD_EXTENSION_TENSOR ON) set(EXECUTORCH_BUILD_KERNELS_CUSTOM ON) endif() @@ -505,7 +510,8 @@ if(EXECUTORCH_BUILD_PYBIND AND APPLE) ) target_link_libraries(executorch_no_prim_ops_shared PRIVATE program_schema) if(DL_LIBRARY_EXISTS) - target_link_libraries(executorch_no_prim_ops_shared PRIVATE dl) # For dladdr() + # For dladdr() + target_link_libraries(executorch_no_prim_ops_shared PRIVATE dl) endif() target_include_directories( executorch_no_prim_ops_shared PUBLIC ${_common_include_directories} @@ -541,17 +547,13 @@ target_link_options_shared_lib(executorch) # operators necessary for the models that will run. 
# if(BUILD_EXECUTORCH_PORTABLE_OPS) -add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/portable) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/portable) endif() if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/optimized) endif() -if(EXECUTORCH_BUILD_KERNELS_QUANTIZED) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/quantized) -endif() - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/configurations) # @@ -582,90 +584,77 @@ cmake_dependent_option( EXECUTORCH_BUILD_EXECUTOR_RUNNER "Build the executor_runner executable" ON EXECUTORCH_BUILD_HOST_TARGETS OFF ) -if(EXECUTORCH_BUILD_EXECUTOR_RUNNER) - # Baseline libraries that executor_runner will link against. - set(_executor_runner_libs executorch gflags) - - if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED) - list(APPEND _executor_runner_libs optimized_native_cpu_ops_lib) - elseif(EXECUTORCH_BUILD_CADENCE) - list(APPEND _executor_runner_libs cadence_ops_lib) - else() - list(APPEND _executor_runner_libs portable_ops_lib) - endif() - - # Generate lib to register quantized ops - if(EXECUTORCH_BUILD_KERNELS_QUANTIZED) - list(APPEND _executor_runner_libs quantized_ops_lib) - endif() - - add_executable(executor_runner ${_executor_runner__srcs}) - if(CMAKE_BUILD_TYPE STREQUAL "Release" AND NOT APPLE) - target_link_options(executor_runner PRIVATE "LINKER:--gc-sections") - endif() - target_link_libraries(executor_runner ${_executor_runner_libs}) - target_compile_options(executor_runner PUBLIC ${_common_compile_options}) -endif() # Add googletest if any test targets should be built if(EXECUTORCH_BUILD_GTESTS) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/googletest) endif() -if(EXECUTORCH_BUILD_SDK) - set(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER - ON - CACHE BOOL "EXECUTORCH_BUILD_EXTENSION_DATA_LOADER" FORCE - ) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/sdk) +if(EXECUTORCH_BUILD_ARM_BAREMETAL) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/arm) endif() -if(EXECUTORCH_BUILD_EXTENSION_APPLE) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/apple) +if(EXECUTORCH_BUILD_CADENCE) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/cadence) endif() -if(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/data_loader) +if(EXECUTORCH_BUILD_COREML) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/apple/coreml) endif() -if(EXECUTORCH_BUILD_EXTENSION_MODULE) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/module) +if(EXECUTORCH_BUILD_MPS) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/apple/mps) endif() if(EXECUTORCH_BUILD_NEURON) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/mediatek) endif() -if(EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/runner_util) +if(EXECUTORCH_BUILD_QNN) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/qualcomm) endif() if(EXECUTORCH_BUILD_XNNPACK) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/xnnpack) endif() -if(EXECUTORCH_BUILD_VULKAN) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/vulkan) +if(EXECUTORCH_BUILD_DEVTOOLS) + set(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER + ON + CACHE BOOL "EXECUTORCH_BUILD_EXTENSION_DATA_LOADER" FORCE + ) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/devtools) endif() -if(EXECUTORCH_BUILD_QNN) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/qualcomm) +if(EXECUTORCH_BUILD_EXTENSION_APPLE) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/apple) endif() 
-if(EXECUTORCH_BUILD_ARM_BAREMETAL) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/arm) +if(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/data_loader) endif() -if(EXECUTORCH_BUILD_MPS) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/apple/mps) +if(EXECUTORCH_BUILD_EXTENSION_MODULE) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/module) endif() -if(EXECUTORCH_BUILD_COREML) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/apple/coreml) +if(EXECUTORCH_BUILD_EXTENSION_TRAINING) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/training) endif() -if(EXECUTORCH_BUILD_CADENCE) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/cadence) +if(EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/runner_util) +endif() + +if(EXECUTORCH_BUILD_EXTENSION_TENSOR) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/tensor) +endif() + +if(EXECUTORCH_BUILD_PTHREADPOOL + AND EXECUTORCH_BUILD_CPUINFO + AND CMAKE_CXX_STANDARD GREATER_EQUAL 14 +) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/threadpool) endif() if(EXECUTORCH_BUILD_PYBIND) @@ -675,8 +664,8 @@ if(EXECUTORCH_BUILD_PYBIND) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/data_loader) endif() - if(NOT EXECUTORCH_BUILD_SDK) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/sdk) + if(NOT EXECUTORCH_BUILD_DEVTOOLS) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/devtools) endif() # find pytorch lib, to allow pybind to take at::Tensor as input/output @@ -691,11 +680,16 @@ if(EXECUTORCH_BUILD_PYBIND) etdump executorch extension_data_loader - portable_ops_lib util torch ) + if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED) + list(APPEND _dep_libs optimized_native_cpu_ops_lib) + else() + list(APPEND _dep_libs portable_ops_lib) + endif() + if(EXECUTORCH_BUILD_COREML) list(APPEND _dep_libs coremldelegate) endif() @@ -710,10 +704,6 @@ if(EXECUTORCH_BUILD_PYBIND) list(APPEND _dep_libs xnnpack_backend XNNPACK) endif() - if(EXECUTORCH_BUILD_KERNELS_QUANTIZED) - target_link_options_shared_lib(quantized_ops_lib) - endif() - # compile options for pybind set(_pybind_compile_options -Wno-deprecated-declarations @@ -726,10 +716,8 @@ if(EXECUTORCH_BUILD_PYBIND) ) # util lib add_library( - util - ${CMAKE_CURRENT_SOURCE_DIR}/extension/evalue_util/print_evalue.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/extension/aten_util/aten_bridge.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/util/read_file.cpp + util ${CMAKE_CURRENT_SOURCE_DIR}/extension/evalue_util/print_evalue.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/extension/aten_util/aten_bridge.cpp ) target_include_directories( util PUBLIC ${_common_include_directories} ${TORCH_INCLUDE_DIRS} @@ -778,12 +766,14 @@ if(EXECUTORCH_BUILD_PYBIND) else() set_target_properties( portable_lib - PROPERTIES # Assume is the root `site-packages/executorch` - # Need to add /extension/llm/custom_ops for - # libcustom_ops_aot_lib - # Need to add /kernels/quantized for - # libquantized_ops_aot_lib - BUILD_RPATH "$ORIGIN:$ORIGIN/../../extension/llm/custom_ops:$ORIGIN/../../kernels/quantized" + PROPERTIES + # Assume is the root `site-packages/executorch` + # Need to add /extension/llm/custom_ops for + # libcustom_ops_aot_lib + # Need to add /kernels/quantized for + # libquantized_ops_aot_lib + BUILD_RPATH + "$ORIGIN:$ORIGIN/../../extension/llm/custom_ops:$ORIGIN/../../kernels/quantized" ) endif() @@ -794,9 +784,45 @@ endif() if(EXECUTORCH_BUILD_KERNELS_CUSTOM) # TODO: move all custom kernels to 
${CMAKE_CURRENT_SOURCE_DIR}/kernels/custom - add_subdirectory( - ${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/custom_ops - ) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/custom_ops) +endif() + +if(EXECUTORCH_BUILD_KERNELS_QUANTIZED) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/quantized) + target_link_options_shared_lib(quantized_ops_lib) +endif() + +if(EXECUTORCH_BUILD_EXECUTOR_RUNNER) + # Baseline libraries that executor_runner will link against. + set(_executor_runner_libs executorch gflags) + + if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED) + list(APPEND _executor_runner_libs optimized_native_cpu_ops_lib) + elseif(EXECUTORCH_BUILD_CADENCE) + list(APPEND _executor_runner_libs cadence_ops_lib) + else() + list(APPEND _executor_runner_libs portable_ops_lib) + endif() + + # Generate lib to register quantized ops + if(EXECUTORCH_BUILD_KERNELS_QUANTIZED) + list(APPEND _executor_runner_libs quantized_ops_lib) + endif() + + add_executable(executor_runner ${_executor_runner__srcs}) + if(CMAKE_BUILD_TYPE STREQUAL "Release") + if(APPLE) + target_link_options(executor_runner PRIVATE "LINKER:-dead_strip") + else() + target_link_options(executor_runner PRIVATE "LINKER:--gc-sections") + endif() + endif() + target_link_libraries(executor_runner ${_executor_runner_libs}) + target_compile_options(executor_runner PUBLIC ${_common_compile_options}) +endif() + +if(EXECUTORCH_BUILD_VULKAN) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/vulkan) endif() # Print all summary diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 2ad23f84d17..d434c1fe198 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -131,9 +131,7 @@ for detailed advice. #### C++ language version -**C++11.** - -NOTE: The code does not yet fully conform to this, and some files require C++17. +**C++17.** Rationale: This is a compromise between being compatible with older, proprietary toolchains, and having access to relatively modern C++ features. diff --git a/README.md b/README.md index c4e6e0caf75..0e78f4da356 100644 --- a/README.md +++ b/README.md @@ -10,9 +10,9 @@ Key value propositions of ExecuTorch are: - **Portability:** Compatibility with a wide variety of computing platforms, from high-end mobile phones to highly constrained embedded systems and microcontrollers. -- **Productivity:** Enabling developers to use the same toolchains and SDK from - PyTorch model authoring and conversion, to debugging and deployment to a wide - variety of platforms. +- **Productivity:** Enabling developers to use the same toolchains and Developer + Tools from PyTorch model authoring and conversion, to debugging and deployment + to a wide variety of platforms. - **Performance:** Providing end users with a seamless and high-performance experience due to a lightweight runtime and utilizing full hardware capabilities such as CPUs, NPUs, and DSPs. @@ -22,6 +22,8 @@ please visit our documentation website [for the latest release](https://pytorch. Check out the [Getting Started](https://pytorch.org/executorch/stable/getting-started-setup.html#quick-setup-colab-jupyter-notebook-prototype) page for a quick spin. +Check out the examples of [Llama](./examples/models/llama2/README.md), [Llava](./examples/models/llava/README.md) and [other models](./examples/README.md) running on edge devices using ExecuTorch. + ## Feedback We welcome any feedback, suggestions, and bug reports from the community to help @@ -93,7 +95,7 @@ tools. ├── schema # ExecuTorch PTE file format flatbuffer schemas. 
├── scripts # Utility scripts for size management, dependency management, etc. -├── sdk # Model profiling, debugging, and introspection. +├── devtools # Model profiling, debugging, and introspection. ├── shim # Compatibility layer between OSS and Internal builds ├── test # Broad scoped end-to-end tests. ├── third-party # Third-party dependencies. diff --git a/backends/apple/coreml/CMakeLists.txt b/backends/apple/coreml/CMakeLists.txt index 113b21bd690..27e09b3f581 100644 --- a/backends/apple/coreml/CMakeLists.txt +++ b/backends/apple/coreml/CMakeLists.txt @@ -13,11 +13,11 @@ if(NOT EXECUTORCH_ROOT) set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..) endif() -if(EXECUTORCH_BUILD_SDK) -# protobuf requires frtti -set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -frtti" ) +if(EXECUTORCH_BUILD_DEVTOOLS) + # protobuf requires frtti + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -frtti") endif() - + option(COREML_BUILD_EXECUTOR_RUNNER "Build CoreML executor runner." OFF) # inmemoryfs sources @@ -136,7 +136,7 @@ target_include_directories( target_include_directories(coremldelegate PRIVATE ${EXECUTORCH_ROOT}/..) target_link_libraries(coremldelegate PRIVATE executorch_no_prim_ops) -if(EXECUTORCH_BUILD_SDK) +if(EXECUTORCH_BUILD_DEVTOOLS) target_sources(coremldelegate PRIVATE ${SDK_SOURCES} ${PROTOBUF_SOURCES}) target_include_directories( coremldelegate @@ -174,7 +174,7 @@ endif() target_compile_options(coremldelegate PRIVATE "-fobjc-arc") target_compile_options(coremldelegate PRIVATE "-fno-exceptions") -if(EXECUTORCH_BUILD_SDK) +if(EXECUTORCH_BUILD_DEVTOOLS) target_compile_options( executorch_no_prim_ops PUBLIC -DET_EVENT_TRACER_ENABLED ) diff --git a/backends/apple/coreml/compiler/coreml_preprocess.py b/backends/apple/coreml/compiler/coreml_preprocess.py index 375fdf406b2..5084405c468 100644 --- a/backends/apple/coreml/compiler/coreml_preprocess.py +++ b/backends/apple/coreml/compiler/coreml_preprocess.py @@ -3,6 +3,7 @@ # CoreML backend for delegating a EdgeProgram to CoreML. 
import json +import logging import shutil import uuid @@ -14,6 +15,7 @@ from typing import Any, Dict, final, List, Optional, Tuple import coremltools as ct +import coremltools.optimize as cto import executorchcoreml from executorch.exir.backend.backend_details import ( @@ -23,12 +25,16 @@ ) from executorch.exir.backend.compile_spec_schema import CompileSpec +logger = logging.getLogger(__name__) +logger.setLevel(logging.WARNING) + class COMPILE_SPEC_KEYS(Enum): COMPUTE_UNITS = "compute_units" MODEL_TYPE = "model_type" MIN_DEPLOYMENT_TARGET = "min_deployment_target" MODEL_COMPUTE_PRECISION = "model_compute_precision" + OP_LINEAR_QUANTIZER_CONFIG = "op_linear_quantizer_config" class MODEL_PATHS(Enum): @@ -169,12 +175,44 @@ def generate_compute_unit_compile_spec( compute_unit.name.lower().encode("utf-8"), ) + @staticmethod + def generate_op_linear_quantizer_config_compile_spec( + op_linear_quantizer_config: Dict, + ) -> CompileSpec: + """ + Returns the compile spec representing the model post conversion quantization, + which is a dict that will construct cto.coreml.OpLinearQuantizerConfig + """ + str_representation = json.dumps(op_linear_quantizer_config) + byte_representation = str_representation.encode("utf-8") + return CompileSpec( + COMPILE_SPEC_KEYS.OP_LINEAR_QUANTIZER_CONFIG.value, + byte_representation, + ) + + @staticmethod + def op_linear_quantizer_config_from_compile_specs( + compile_specs: List[CompileSpec], + ) -> cto.coreml.OpLinearQuantizerConfig: + """ + Returns the model's post conversion quantization by parsing the list of compile specs. + """ + for compile_spec in compile_specs: + if compile_spec.key == COMPILE_SPEC_KEYS.OP_LINEAR_QUANTIZER_CONFIG.value: + config_dict_str = compile_spec.value.decode("utf-8") + config_dict = json.loads(config_dict_str) + config = cto.coreml.OpLinearQuantizerConfig._from_dict(config_dict) + return config + + return None + @staticmethod def generate_compile_specs( compute_unit: ct.ComputeUnit = ct.ComputeUnit.ALL, minimum_deployment_target: ct.target = ct.target.iOS15, compute_precision: ct.precision = ct.precision.FLOAT16, model_type: MODEL_TYPE = MODEL_TYPE.MODEL, + op_linear_quantizer_config: Optional[Dict] = None, ) -> List[CompileSpec]: """ Returns the list of compile specs that's used by CoreMLBackend to lower the module. 
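For context, a usage sketch of the `op_linear_quantizer_config` compile spec introduced above. The config values are illustrative, not defaults; the dict is JSON-serialized into the compile spec and rebuilt as a `cto.coreml.OpLinearQuantizerConfig` inside `preprocess`, where it is applied with `cto.coreml.linear_quantize_weights` after conversion.

import coremltools as ct

from executorch.backends.apple.coreml.compiler import CoreMLBackend
from executorch.backends.apple.coreml.partition import CoreMLPartitioner

compile_specs = CoreMLBackend.generate_compile_specs(
    compute_unit=ct.ComputeUnit.ALL,
    minimum_deployment_target=ct.target.iOS17,
    compute_precision=ct.precision.FLOAT16,
    # Post-conversion weight quantization; keys mirror the
    # cto.coreml.OpLinearQuantizerConfig constructor (illustrative values).
    op_linear_quantizer_config={
        "mode": "linear_symmetric",
        "dtype": "int8",
        "granularity": "per_channel",
    },
)
partitioner = CoreMLPartitioner(compile_specs=compile_specs)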
@@ -192,6 +230,12 @@ def generate_compile_specs( CoreMLBackend.generate_compute_precision_compile_spec(compute_precision) ) compile_specs.append(CoreMLBackend.generate_model_type_compile_spec(model_type)) + if op_linear_quantizer_config is not None: + compile_specs.append( + CoreMLBackend.generate_op_linear_quantizer_config_compile_spec( + op_linear_quantizer_config + ) + ) return compile_specs @@ -368,18 +412,18 @@ def preprocess( compile_specs, ) ) - model_compute_precision: ct.precision = ( CoreMLBackend.model_compute_precision_from_compile_specs(compile_specs) ) - minimum_deployment_target: ct.target = ( CoreMLBackend.min_deployment_target_from_compile_specs(compile_specs) ) - compute_units: ct.ComputeUnit = CoreMLBackend.compute_unit_from_compile_specs( compile_specs ) + op_linear_quantizer_config = ( + CoreMLBackend.op_linear_quantizer_config_from_compile_specs(compile_specs) + ) mlmodel = ct.convert( model=edge_program, @@ -392,4 +436,15 @@ def preprocess( compute_units=compute_units, ) + if op_linear_quantizer_config is not None: + logger.warning( + "Core ML Backend op_linear_quantizer_config API is experimental" + ) + config = cto.coreml.OptimizationConfig( + global_config=op_linear_quantizer_config, + # skip embedding + op_type_configs={"gather": None}, + ) + mlmodel = cto.coreml.linear_quantize_weights(mlmodel, config=config) + return CoreMLBackend.preprocess_model(mlmodel, model_type=model_type) diff --git a/backends/apple/coreml/partition/coreml_partitioner.py b/backends/apple/coreml/partition/coreml_partitioner.py index ecf6d44b19c..c0b6663f729 100644 --- a/backends/apple/coreml/partition/coreml_partitioner.py +++ b/backends/apple/coreml/partition/coreml_partitioner.py @@ -17,7 +17,7 @@ Partitioner, PartitionResult, ) -from executorch.exir.backend.utils import tag_constant_data +from executorch.exir.backend.utils import tag_constant_data, tag_mutated_buffer from torch.export.exported_program import ExportedProgram from torch.fx.passes.infra.partitioner import CapabilityBasedPartitioner from torch.fx.passes.operator_support import OperatorSupportBase @@ -61,6 +61,7 @@ def __init__( self, skip_ops_for_coreml_delegation: Optional[List[str]] = None, compile_specs: Optional[List[CompileSpec]] = None, + take_over_mutable_buffer: Optional[bool] = True, ) -> None: if skip_ops_for_coreml_delegation is None: skip_ops_for_coreml_delegation = [] @@ -69,6 +70,7 @@ def __init__( backend_id=CoreMLBackend.__name__, compile_specs=compile_specs if compile_specs is not None else [], ) + self.take_over_mutable_buffer = take_over_mutable_buffer def partition(self, exported_program: ExportedProgram) -> PartitionResult: # Run the CapabilityBasedPartitioner to return the largest possible @@ -89,6 +91,15 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult: partition_tags[tag] = self.delegation_spec tag_constant_data(exported_program) + if self.take_over_mutable_buffer: + logger.info( + "Core ML partitioner will take over torch mutable buffer as Core ML state, " + "so if your model contains mutable buffer, " + "then you will need MacOS15+/iOS18+ to execute. 
" + "If you want your mutable buffer model to be compatible with older OS, " + "then please set `take_over_mutable_buffer=False`" + ) + tag_mutated_buffer(exported_program) return PartitionResult( tagged_exported_program=exported_program, partition_tags=partition_tags diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLDefaultModelExecutor.mm b/backends/apple/coreml/runtime/delegate/ETCoreMLDefaultModelExecutor.mm index 57316e28015..226307f3c8f 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLDefaultModelExecutor.mm +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLDefaultModelExecutor.mm @@ -29,9 +29,10 @@ - (instancetype)initWithModel:(ETCoreMLModel *)model { if (self.ignoreOutputBackings) { predictionOptions.outputBackings = @{}; } - id outputs = [self.model.mlModel predictionFromFeatures:inputs - options:predictionOptions - error:error]; + + id outputs = [self.model predictionFromFeatures:inputs + options:predictionOptions + error:error]; if (!outputs) { return nil; } diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLModel.h b/backends/apple/coreml/runtime/delegate/ETCoreMLModel.h index 9bf3183e65a..58026593462 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLModel.h +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLModel.h @@ -37,15 +37,12 @@ __attribute__((objc_subclassing_restricted)) orderedOutputNames:(NSOrderedSet*)orderedOutputNames error:(NSError* __autoreleasing*)error NS_DESIGNATED_INITIALIZER; -- (nullable NSArray*)prepareInputs:(const std::vector&)inputs - error:(NSError* __autoreleasing*)error; - -- (nullable NSArray*)prepareOutputBackings:(const std::vector&)outputs - error:(NSError* __autoreleasing*)error; - /// The underlying MLModel. @property (strong, readonly, nonatomic) MLModel* mlModel; +/// The model state. +@property (strong, readonly, nonatomic, nullable) id state; + /// The asset from which the model is loaded. @property (strong, readonly, nonatomic) ETCoreMLAsset* asset; @@ -58,6 +55,19 @@ __attribute__((objc_subclassing_restricted)) /// The ordered output names of the model. 
@property (copy, readonly, nonatomic) NSOrderedSet* orderedOutputNames; + +- (nullable id)predictionFromFeatures:(id)input + options:(MLPredictionOptions*)options + error:(NSError* __autoreleasing*)error; + +- (nullable NSArray*)prepareInputs:(const std::vector&)inputs + error:(NSError* __autoreleasing*)error; + +- (nullable NSArray*)prepareOutputBackings:(const std::vector&)outputs + error:(NSError* __autoreleasing*)error; + +- (BOOL)prewarmAndReturnError:(NSError* __autoreleasing*)error; + @end NS_ASSUME_NONNULL_END diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLModel.mm b/backends/apple/coreml/runtime/delegate/ETCoreMLModel.mm index ee7218bd271..6b39ae5f920 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLModel.mm +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLModel.mm @@ -7,10 +7,12 @@ #import -#import +#import "ETCoreMLAsset.h" +#import "ETCoreMLLogging.h" +#import "multiarray.h" +#import "objc_array_util.h" +#import "MLModel_Prewarm.h" #import -#import -#import #import #pragma mark - ETCoreMLMultiArrayDescriptor @@ -155,6 +157,19 @@ size_t get_number_of_bytes(MLMultiArrayDataType data_type) { return get_multi_array_constraints_by_name(description.outputDescriptionsByName); } +#if MODEL_STATE_IS_SUPPORTED +API_AVAILABLE(macos(15.0), ios(18.0), tvos(18.0), watchos(11.0)) +void reset_state_for_feature_name(NSString *feature_name, MLState *state) { + [state getMultiArrayForStateNamed:feature_name handler:^(MLMultiArray *buffer) { + [buffer getMutableBytesWithHandler:^(void *mutableBytes, NSInteger size, NSArray * __unused strides) { + uint8_t *start = reinterpret_cast(mutableBytes); + uint8_t *end = start + size; + std::fill(start, end, uint8_t(0)); + }]; + }]; +} +#endif + } #pragma mark - ETCoreMLModel @@ -194,6 +209,11 @@ - (nullable instancetype)initWithAsset:(ETCoreMLAsset *)asset _cache = [[NSCache alloc] init]; _inputConstraintsByName = get_multi_array_input_constraints_by_name(mlModel.modelDescription); _outputConstraintsByName = get_multi_array_output_constraints_by_name(mlModel.modelDescription); +#if MODEL_STATE_IS_SUPPORTED + if (@available(macOS 15.0, iOS 18.0, tvOS 18.0, watchOS 11.0, *)) { + _state = mlModel.modelDescription.stateDescriptionsByName.count > 0 ? 
[_mlModel newState] : nil; + } +#endif } return self; @@ -272,4 +292,52 @@ MultiArray buffer(mutableBytes, MultiArray::MemoryLayout(to_multiarray_data_type } +- (nullable id)predictionFromFeatures:(id)input + options:(MLPredictionOptions *)options + error:(NSError **)error { +#if MODEL_STATE_IS_SUPPORTED + if (@available(macOS 15.0, iOS 18.0, tvOS 18.0, watchOS 11.0, *)) { + if (self.state != nil) { + return [self.mlModel predictionFromFeatures:input + usingState:(MLState *)self.state + options:options + error:error]; + } + } +#endif + + id result = [self.mlModel predictionFromFeatures:input + options:options + error:error]; + + return result; +} + +- (BOOL)prewarmAndReturnError:(NSError* __autoreleasing*)error { + NSError *localError = nil; + BOOL result = [self.mlModel prewarmUsingState:self.state error:error]; + if (!result) { + ETCoreMLLogError(localError, + "%@: Failed to prewarm model with identifier = %@", + NSStringFromClass(self.class), + self.identifier); + } + +#if MODEL_STATE_IS_SUPPORTED + if (@available(macOS 15.0, iOS 18.0, tvOS 18.0, watchOS 11.0, *)) { + NSDictionary *stateDescriptions = self.mlModel.modelDescription.stateDescriptionsByName; + [stateDescriptions enumerateKeysAndObjectsUsingBlock:^(NSString *featureName, MLFeatureDescription * __unused obj, BOOL * __unused stop) { + reset_state_for_feature_name(featureName, (MLState *) self.state); + }]; + } +#endif + + + if (error) { + *error = localError; + } + + return result; +} + @end diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.mm b/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.mm index 8d6d537385b..cd0fbc86f99 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.mm +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.mm @@ -598,21 +598,8 @@ - (BOOL)prewarmModelWithHandle:(ModelHandle *)handle if (!model) { return NO; } - - NSError *localError = nil; - BOOL result = [model.mlModel prewarmAndReturnError:&localError]; - if (!result) { - ETCoreMLLogError(localError, - "%@: Failed to prewarm model with identifier = %@", - NSStringFromClass(self.assetManager.class), - model.identifier); - } - - if (error) { - *error = localError; - } - - return result; + + return [model prewarmAndReturnError:error]; } - (void)prewarmRecentlyUsedAssetsWithMaxCount:(NSUInteger)maxCount { @@ -682,16 +669,15 @@ - (void)addPrewarmedAsset:(ETCoreMLAsset *)asset { error:&localError]; // Try without output backings. 
if (!modelOutputs && predictionOptions.outputBackings.count > 0) { - localError = nil; executor.ignoreOutputBackings = YES; + localError = nil; + modelOutputs = [executor executeModelWithInputs:inputFeatures + predictionOptions:predictionOptions + loggingOptions:loggingOptions + eventLogger:eventLogger + error:&localError]; } - - modelOutputs = [executor executeModelWithInputs:inputFeatures - predictionOptions:predictionOptions - loggingOptions:loggingOptions - eventLogger:eventLogger - error:&localError]; - + if (error) { *error = localError; } diff --git a/backends/apple/coreml/runtime/delegate/MLModel_Prewarm.h b/backends/apple/coreml/runtime/delegate/MLModel_Prewarm.h index c066608b893..6caf99507dc 100644 --- a/backends/apple/coreml/runtime/delegate/MLModel_Prewarm.h +++ b/backends/apple/coreml/runtime/delegate/MLModel_Prewarm.h @@ -8,6 +8,9 @@ #import +#if !defined(MODEL_STATE_IS_SUPPORTED) && __has_include() +#define MODEL_STATE_IS_SUPPORTED 1 +#endif NS_ASSUME_NONNULL_BEGIN @@ -15,9 +18,10 @@ NS_ASSUME_NONNULL_BEGIN /// Pre-warms the model by running a prediction with zeroed-out inputs. /// +/// @param state The model state. /// @param error On failure, error is filled with the failure information. /// @retval `YES` if the prediction succeeded otherwise `NO`. -- (BOOL)prewarmAndReturnError:(NSError* __autoreleasing*)error; +- (BOOL)prewarmUsingState:(nullable id)state error:(NSError* __autoreleasing*)error; @end diff --git a/backends/apple/coreml/runtime/delegate/MLModel_Prewarm.mm b/backends/apple/coreml/runtime/delegate/MLModel_Prewarm.mm index 71ce967ac3e..d6f59666cf0 100644 --- a/backends/apple/coreml/runtime/delegate/MLModel_Prewarm.mm +++ b/backends/apple/coreml/runtime/delegate/MLModel_Prewarm.mm @@ -71,16 +71,28 @@ + (MLMultiArray *)zeroedMultiArrayWithShape:(NSArray *)shape @implementation MLModel (Prewarm) -- (BOOL)prewarmAndReturnError:(NSError * __autoreleasing *)error { +- (BOOL)prewarmUsingState:(nullable id)state error:(NSError * __autoreleasing *)error { @autoreleasepool { id inputs = ::get_zeroed_inputs(self, error); if (!inputs) { return NO; } - - id outputs = [self predictionFromFeatures:inputs error:error]; + + + id outputs = nil; + if (state != nil) { +#if MODEL_STATE_IS_SUPPORTED + if (@available(macOS 15.0, iOS 18.0, tvOS 18.0, watchOS 11.0, *)) { + outputs = [self predictionFromFeatures:inputs usingState:(MLState *)state error:error]; + return outputs != nil; + } +#endif + } + + outputs = [self predictionFromFeatures:inputs error:error]; return outputs != nil; } } + @end diff --git a/backends/apple/coreml/runtime/include/coreml_backend/delegate.h b/backends/apple/coreml/runtime/include/coreml_backend/delegate.h index a11d41bf7f4..1943e0f05b0 100644 --- a/backends/apple/coreml/runtime/include/coreml_backend/delegate.h +++ b/backends/apple/coreml/runtime/include/coreml_backend/delegate.h @@ -20,7 +20,7 @@ class BackendDelegate; namespace torch { namespace executor { -class CoreMLBackendDelegate final : public PyTorchBackendInterface { +class CoreMLBackendDelegate final : public ::executorch::runtime::BackendInterface { public: CoreMLBackendDelegate() noexcept; ~CoreMLBackendDelegate() = default; diff --git a/backends/apple/coreml/runtime/sdk/ETCoreMLModelAnalyzer.mm b/backends/apple/coreml/runtime/sdk/ETCoreMLModelAnalyzer.mm index 1740faf00e6..988b5d808a0 100644 --- a/backends/apple/coreml/runtime/sdk/ETCoreMLModelAnalyzer.mm +++ b/backends/apple/coreml/runtime/sdk/ETCoreMLModelAnalyzer.mm @@ -88,10 +88,9 @@ - (nullable 
instancetype)initWithCompiledModelAsset:(ETCoreMLAsset *)compiledMod eventLogger:(const executorchcoreml::ModelEventLogger *)eventLogger error:(NSError * __autoreleasing *)error { if (self.profiler == nil) { - ETCoreMLModelProfiler *profiler = [[ETCoreMLModelProfiler alloc] initWithCompiledModelAsset:self.model.asset - outputNames:self.model.orderedOutputNames - configuration:self.configuration - error:error]; + ETCoreMLModelProfiler *profiler = [[ETCoreMLModelProfiler alloc] initWithModel:self.model + configuration:self.configuration + error:error]; self.profiler = profiler; } diff --git a/backends/apple/coreml/runtime/sdk/ETCoreMLModelProfiler.h b/backends/apple/coreml/runtime/sdk/ETCoreMLModelProfiler.h index 07a384a5167..7a43a30d752 100644 --- a/backends/apple/coreml/runtime/sdk/ETCoreMLModelProfiler.h +++ b/backends/apple/coreml/runtime/sdk/ETCoreMLModelProfiler.h @@ -31,14 +31,12 @@ __attribute__((objc_subclassing_restricted)) /// Constructs an `ETCoreMLModelProfiler` instance. /// -/// @param compiledModelAsset The compiled model asset (mlmodelc). -/// @param outputNames The model output names. +/// @param model The model. /// @param configuration The model configuration. /// @param error On failure, error is filled with the failure information. -- (nullable instancetype)initWithCompiledModelAsset:(ETCoreMLAsset*)compiledModelAsset - outputNames:(NSOrderedSet*)outputNames - configuration:(MLModelConfiguration*)configuration - error:(NSError* __autoreleasing*)error NS_DESIGNATED_INITIALIZER; +- (nullable instancetype)initWithModel:(ETCoreMLModel*)model + configuration:(MLModelConfiguration*)configuration + error:(NSError* __autoreleasing*)error NS_DESIGNATED_INITIALIZER; /// Returns profiling info of operations at the specified paths. /// diff --git a/backends/apple/coreml/runtime/sdk/ETCoreMLModelProfiler.mm b/backends/apple/coreml/runtime/sdk/ETCoreMLModelProfiler.mm index c9ad324a6c0..5998701eb0f 100644 --- a/backends/apple/coreml/runtime/sdk/ETCoreMLModelProfiler.mm +++ b/backends/apple/coreml/runtime/sdk/ETCoreMLModelProfiler.mm @@ -8,6 +8,7 @@ #import "ETCoreMLModelProfiler.h" #import "ETCoreMLAsset.h" +#import "ETCoreMLModel.h" #import "ETCoreMLLogging.h" #import "ETCoreMLModelStructurePath.h" #import "ETCoreMLOperationProfilingInfo.h" @@ -221,8 +222,8 @@ void set_model_outputs(id output_features, } @interface ETCoreMLModelProfiler () -/// The CoreML model. -@property (readonly, strong, nonatomic) MLModel *model; +/// The model. +@property (readonly, strong, nonatomic) ETCoreMLModel *model; /// The model output names. 
@property (readonly, copy, nonatomic) NSOrderedSet *outputNames; #if MODEL_PROFILING_IS_AVAILABLE @@ -240,25 +241,19 @@ @interface ETCoreMLModelProfiler () @implementation ETCoreMLModelProfiler -- (nullable instancetype)initWithCompiledModelAsset:(ETCoreMLAsset *)compiledModelAsset - outputNames:(NSOrderedSet *)outputNames - configuration:(MLModelConfiguration *)configuration - error:(NSError * __autoreleasing *)error { +- (nullable instancetype)initWithModel:(ETCoreMLModel *)model + configuration:(MLModelConfiguration *)configuration + error:(NSError * __autoreleasing *)error { #if MODEL_PROFILING_IS_AVAILABLE if (@available(macOS 14.4, iOS 17.4, tvOS 17.4, watchOS 10.4, *)) { - NSURL *compiledModelURL = compiledModelAsset.contentURL; + NSURL *compiledModelURL = model.asset.contentURL; MLComputePlan *computePlan = get_compute_plan_of_model_at_url(compiledModelURL, configuration, error); if (!computePlan) { return nil; } - - MLModel *model = [MLModel modelWithContentsOfURL:compiledModelURL error:error]; - if (!model) { - return nil; - } - + __block NSMutableArray *operationPaths = [NSMutableArray array]; __block NSMutableDictionary *operationToPathMap = [NSMutableDictionary dictionary]; __block NSMutableArray *topologicallySortedOperations = [NSMutableArray new]; @@ -280,7 +275,6 @@ - (nullable instancetype)initWithCompiledModelAsset:(ETCoreMLAsset *)compiledMod self = [super init]; if (self) { - _outputNames = [outputNames copy]; _model = model; _computePlan = computePlan; _operationToPathMap = operationToPathMap; diff --git a/backends/apple/coreml/runtime/test/CoreMLBackendDelegateTests.mm b/backends/apple/coreml/runtime/test/CoreMLBackendDelegateTests.mm index d7218905fc2..691d4d726ed 100644 --- a/backends/apple/coreml/runtime/test/CoreMLBackendDelegateTests.mm +++ b/backends/apple/coreml/runtime/test/CoreMLBackendDelegateTests.mm @@ -13,6 +13,8 @@ #import #import +#import "MLModel_Prewarm.h" + static constexpr size_t kRuntimeMemorySize = 50 * 1024U * 1024U; // 50 MB using namespace torch::executor; @@ -184,20 +186,28 @@ - (void)executeModelAtURL:(NSURL *)modelURL nLoads:(NSUInteger)nLoads nExecution - (void)testAddProgramExecute { NSURL *modelURL = [[self class] bundledResourceWithName:@"add_coreml_all" extension:@"pte"]; XCTAssertNotNil(modelURL); - [self executeModelAtURL:modelURL nLoads:5 nExecutions:2]; + [self executeModelAtURL:modelURL nLoads:1 nExecutions:2]; } - (void)testMulProgramExecute { NSURL *modelURL = [[self class] bundledResourceWithName:@"mul_coreml_all" extension:@"pte"]; XCTAssertNotNil(modelURL); - [self executeModelAtURL:modelURL nLoads:5 nExecutions:2]; + [self executeModelAtURL:modelURL nLoads:1 nExecutions:2]; } - (void)testMV3ProgramExecute { NSURL *modelURL = [[self class] bundledResourceWithName:@"mv3_coreml_all" extension:@"pte"]; XCTAssertNotNil(modelURL); - [self executeModelAtURL:modelURL nLoads:5 nExecutions:2]; + [self executeModelAtURL:modelURL nLoads:1 nExecutions:2]; +} + +#if MODEL_STATE_IS_SUPPORTED +- (void)testStateProgramExecute { + NSURL *modelURL = [[self class] bundledResourceWithName:@"state_coreml_all" extension:@"pte"]; + XCTAssertNotNil(modelURL); + [self executeModelAtURL:modelURL nLoads:1 nExecutions:2]; } +#endif - (void)executeMultipleModelsConcurrently:(NSArray *)modelURLs nLoads:(NSUInteger)nLoads diff --git a/backends/apple/coreml/runtime/test/export_stateful_model.py b/backends/apple/coreml/runtime/test/export_stateful_model.py new file mode 100644 index 00000000000..61d1a93980f --- /dev/null +++ 
b/backends/apple/coreml/runtime/test/export_stateful_model.py @@ -0,0 +1,77 @@ +# Copyright © 2024 Apple Inc. All rights reserved. +# +# Please refer to the license found in the LICENSE file in the root directory of the source tree. + +import os +from pathlib import Path + +import coremltools as ct +import executorch.exir as exir + +import torch + +from executorch.backends.apple.coreml.compiler import CoreMLBackend +from executorch.backends.apple.coreml.partition import CoreMLPartitioner +from torch.export import export + + +class StatefulModel(torch.nn.Module): + def __init__( + self, + embedding_dim: int, + max_seq_len: int, + ): + super().__init__() + self.register_buffer( + "cache", torch.zeros((max_seq_len, embedding_dim), dtype=torch.float32) + ) + + def forward( + self, + q: torch.Tensor, + k_val: torch.Tensor, + input_pos: torch.Tensor, + ): + q_T = q.transpose(0, 1) + k = torch.ops.aten.index_put_(self.cache, [input_pos, None], k_val) + attn = k.mm(q_T) + return attn + + +def main() -> None: + embedding_dim = 3 + max_seq_len = 2 + model = StatefulModel(embedding_dim=embedding_dim, max_seq_len=max_seq_len) + example_inputs = ( + torch.randn((1, embedding_dim)), + torch.randn((1, embedding_dim)), + torch.tensor([0]), + ) + exported_model = export(model, example_inputs) + edge_program_manager = exir.to_edge(exported_model) + compile_specs = CoreMLBackend.generate_compile_specs( + compute_precision=ct.precision.FLOAT16, + compute_unit=ct.ComputeUnit.ALL, + minimum_deployment_target=ct.target.iOS18, + ) + + partitioner = CoreMLPartitioner( + skip_ops_for_coreml_delegation=None, + compile_specs=compile_specs, + ) + + delegated_program_manager = edge_program_manager.to_backend(partitioner) + exec_program = delegated_program_manager.to_executorch( + config=exir.ExecutorchBackendConfig(extract_delegate_segments=True) + ) + + buffer = exec_program.buffer + models_dir = Path(os.path.dirname(os.path.realpath(__file__))) / "models" + models_dir.mkdir(parents=False, exist_ok=True) + file_path = models_dir / "state_coreml_all.pte" + with open(file_path.resolve(), "wb") as file: + file.write(buffer) + + +if __name__ == "__main__": + main() # pragma: no cover diff --git a/backends/apple/coreml/runtime/workspace/executorchcoreml.xcodeproj/project.pbxproj b/backends/apple/coreml/runtime/workspace/executorchcoreml.xcodeproj/project.pbxproj index c347c56db03..2daa5615ba9 100644 --- a/backends/apple/coreml/runtime/workspace/executorchcoreml.xcodeproj/project.pbxproj +++ b/backends/apple/coreml/runtime/workspace/executorchcoreml.xcodeproj/project.pbxproj @@ -7,6 +7,7 @@ objects = { /* Begin PBXBuildFile section */ + 8307EB8A2C9262060011AE6D /* state_coreml_all.pte in Resources */ = {isa = PBXBuildFile; fileRef = 8307EB892C9262060011AE6D /* state_coreml_all.pte */; }; 83BB78A02C65DA7300274ED7 /* ETCoreMLModelDebugInfo.mm in Sources */ = {isa = PBXBuildFile; fileRef = 83BB789F2C65DA7300274ED7 /* ETCoreMLModelDebugInfo.mm */; }; 83BB78BF2C66AAAE00274ED7 /* add_mul_coreml_all.bin in Resources */ = {isa = PBXBuildFile; fileRef = 83BB78BD2C66AAAE00274ED7 /* add_mul_coreml_all.bin */; }; 83BB78C02C66AAAE00274ED7 /* add_mul_coreml_all.pte in Resources */ = {isa = PBXBuildFile; fileRef = 83BB78BE2C66AAAE00274ED7 /* add_mul_coreml_all.pte */; }; @@ -120,6 +121,7 @@ /* End PBXCopyFilesBuildPhase section */ /* Begin PBXFileReference section */ + 8307EB892C9262060011AE6D /* state_coreml_all.pte */ = {isa = PBXFileReference; lastKnownFileType = file; name = state_coreml_all.pte; path = 
../test/models/state_coreml_all.pte; sourceTree = ""; }; 83BB789E2C65DA7300274ED7 /* ETCoreMLModelDebugInfo.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = ETCoreMLModelDebugInfo.h; path = ../sdk/ETCoreMLModelDebugInfo.h; sourceTree = ""; }; 83BB789F2C65DA7300274ED7 /* ETCoreMLModelDebugInfo.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; name = ETCoreMLModelDebugInfo.mm; path = ../sdk/ETCoreMLModelDebugInfo.mm; sourceTree = ""; }; 83BB78BD2C66AAAE00274ED7 /* add_mul_coreml_all.bin */ = {isa = PBXFileReference; lastKnownFileType = archive.macbinary; name = add_mul_coreml_all.bin; path = ../test/models/add_mul_coreml_all.bin; sourceTree = ""; }; @@ -607,6 +609,7 @@ C98551982AD2542D009143F9 /* mv3_coreml_all.pte */, 83BB78BD2C66AAAE00274ED7 /* add_mul_coreml_all.bin */, 83BB78BE2C66AAAE00274ED7 /* add_mul_coreml_all.pte */, + 8307EB892C9262060011AE6D /* state_coreml_all.pte */, ); name = models; sourceTree = ""; @@ -677,6 +680,7 @@ C985519E2AD2542D009143F9 /* mv3_coreml_all.pte in Resources */, C98551A02AD2542D009143F9 /* add_coreml_all.bin in Resources */, C98551A22AD2542D009143F9 /* mul_coreml_all.pte in Resources */, + 8307EB8A2C9262060011AE6D /* state_coreml_all.pte in Resources */, C98551A32AD2542D009143F9 /* add_coreml_all.pte in Resources */, ); runOnlyForDeploymentPostprocessing = 0; diff --git a/backends/apple/coreml/scripts/generate_test_models.sh b/backends/apple/coreml/scripts/generate_test_models.sh index bbe9809ff8d..0c1822aa828 100755 --- a/backends/apple/coreml/scripts/generate_test_models.sh +++ b/backends/apple/coreml/scripts/generate_test_models.sh @@ -17,14 +17,17 @@ cd "$EXECUTORCH_ROOT_PATH" mkdir "$COREML_DIR_PATH/runtime/test/models/" #Generate models -echo "Executorch: Generating test models" cd "$EXECUTORCH_ROOT_PATH" MODELS=("add" "add_mul" "mul" "mv3") for MODEL in "${MODELS[@]}" do + echo "Executorch: Generating $MODEL model" # TODO: Don't use the script in examples directory. python3 -m examples.apple.coreml.scripts.export --model_name "$MODEL" --save_processed_bytes mv -f "$MODEL""_coreml_all.pte" "$COREML_DIR_PATH/runtime/test/models" mv -f "$MODEL""_coreml_all.bin" "$COREML_DIR_PATH/runtime/test/models" done + +echo "Executorch: Generating stateful model" +python3 "$SCRIPT_DIR_PATH/../runtime/test/export_stateful_model.py" diff --git a/backends/apple/coreml/scripts/install_requirements.sh b/backends/apple/coreml/scripts/install_requirements.sh index 0018b5ffc2d..b3ea0d77ca0 100755 --- a/backends/apple/coreml/scripts/install_requirements.sh +++ b/backends/apple/coreml/scripts/install_requirements.sh @@ -24,7 +24,7 @@ rm -rf "$COREML_DIR_PATH/third-party" mkdir "$COREML_DIR_PATH/third-party" echo "${green}ExecuTorch: Cloning coremltools." -git clone --depth 1 --branch 8.0b1 "https://github.com/apple/coremltools.git" $COREMLTOOLS_DIR_PATH +git clone --depth 1 --branch 8.0 "https://github.com/apple/coremltools.git" $COREMLTOOLS_DIR_PATH cd $COREMLTOOLS_DIR_PATH STATUS=$? @@ -47,6 +47,11 @@ cmake --build "$COREMLTOOLS_DIR_PATH/build" --parallel echo "${green}ExecuTorch: Installing coremltools." pip install "$COREMLTOOLS_DIR_PATH" +# CoreMLTools have started supporting numpy 2.0, +# but ExecuTorch example model test env is still using older transformers, +# so for now we will need to downgrade numpy to 1.x +# TODO: Remove this numpy downgrade once later transformers starts to be used +pip install numpy==1.26.4 STATUS=$? if [ $STATUS -ne 0 ]; then echo "${red}ExecuTorch: Failed to install coremltools." 
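Ahead of the partitioner test below, a compact sketch of the stateful-lowering flow it exercises: with `take_over_mutable_buffer=True` (the default) the mutable buffer is tagged for the delegate and becomes Core ML state, which requires iOS 18 / macOS 15, while passing `False` keeps the buffer on the ExecuTorch side for older OS versions. The module and shapes are illustrative, mirroring the stateful export script above.

import coremltools as ct
import executorch.exir as exir
import torch

from executorch.backends.apple.coreml.compiler import CoreMLBackend
from executorch.backends.apple.coreml.partition import CoreMLPartitioner


class KVCache(torch.nn.Module):
    def __init__(self, max_seq_len: int = 2, embedding_dim: int = 3):
        super().__init__()
        self.register_buffer("cache", torch.zeros((max_seq_len, embedding_dim)))

    def forward(self, q, k_val, input_pos):
        # In-place update of the registered buffer (the "mutable buffer").
        k = torch.ops.aten.index_put_(self.cache, [input_pos, None], k_val)
        return k.mm(q.transpose(0, 1))


example_inputs = (torch.randn((1, 3)), torch.randn((1, 3)), torch.tensor([0]))
exported = torch.export.export(KVCache(), example_inputs)

compile_specs = CoreMLBackend.generate_compile_specs(
    minimum_deployment_target=ct.target.iOS18  # needed when the buffer becomes Core ML state
)
partitioner = CoreMLPartitioner(
    compile_specs=compile_specs,
    take_over_mutable_buffer=True,  # set False to target pre-iOS 18 / pre-macOS 15
)
delegated = exir.to_edge(exported).to_backend(partitioner)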
diff --git a/backends/apple/coreml/test/test_coreml_partitioner.py b/backends/apple/coreml/test/test_coreml_partitioner.py index 34cf531b261..72a7fbf0932 100644 --- a/backends/apple/coreml/test/test_coreml_partitioner.py +++ b/backends/apple/coreml/test/test_coreml_partitioner.py @@ -4,11 +4,14 @@ import unittest +import coremltools as ct + import executorch.exir import torch import torchvision +from executorch.backends.apple.coreml.compiler import CoreMLBackend from executorch.backends.apple.coreml.partition import CoreMLPartitioner @@ -86,8 +89,54 @@ def test_vit_skip_conv(self): if node.op == "call_function" ] == total + def test_buffer(self): + embedding_dim = 3 + max_seq_len = 2 + + class Model(torch.nn.Module): + def __init__(self): + super().__init__() + self.register_buffer( + "cache", + torch.zeros((max_seq_len, embedding_dim), dtype=torch.float32), + ) + + def forward(self, q, k_val, input_pos): + q_T = q.transpose(0, 1) + k = torch.ops.aten.index_put_(self.cache, [input_pos, None], k_val) + attn = k.mm(q_T) + return attn + + model = Model() + model.eval() + + q = torch.randn((1, embedding_dim)) + k_val = torch.randn((1, embedding_dim)) + input_pos = torch.tensor([0]) + example_inputs = (q, k_val, input_pos) + exir_program_aten = torch.export.export(model, example_inputs) + + compile_specs = CoreMLBackend.generate_compile_specs( + minimum_deployment_target=ct.target.iOS18 + ) + partitioner = CoreMLPartitioner(compile_specs=compile_specs) + edge_program_manager = executorch.exir.to_edge( + exir_program_aten, compile_config=self.edge_compile_config + ) + delegated_program_manager = edge_program_manager.to_backend(partitioner) + + assert [ + node.target.__name__ + for node in delegated_program_manager.exported_program().graph.nodes + if node.op == "call_function" + ] == [ + "executorch_call_delegate", + "getitem", + ] + if __name__ == "__main__": test_runner = TestCoreMLPartitioner() test_runner.test_add_sub_skip_mm() test_runner.test_vit_skip_conv() + test_runner.test_buffer() diff --git a/backends/apple/mps/TARGETS b/backends/apple/mps/TARGETS index b8ab3427a9e..1ab92b3fca0 100644 --- a/backends/apple/mps/TARGETS +++ b/backends/apple/mps/TARGETS @@ -95,8 +95,8 @@ runtime.python_test( "//executorch/examples/models:models", "//executorch/exir/tests:models", "//executorch/extension/export_util:export_util", - "//executorch/sdk:lib", - "//executorch/sdk/bundled_program/serialize:lib", + "//executorch/devtools:lib", + "//executorch/devtools/bundled_program/serialize:lib", "fbsource//third-party/pypi/pytest:pytest", ], ) diff --git a/backends/apple/mps/operators/node_visitor.py b/backends/apple/mps/operators/node_visitor.py index d2f7219748a..2b443134bf8 100644 --- a/backends/apple/mps/operators/node_visitor.py +++ b/backends/apple/mps/operators/node_visitor.py @@ -77,7 +77,7 @@ def define_tensor( """Defines a tensor value into the MPSGraph serialization schema Args: - tensor (torch.fx.Node): EdgeIR tensor to define into mps_graph + node (torch.fx.Node): EdgeIR tensor to define into mps_graph mps_graph (MPSGraph): MPSGraph object for serializing into flatbuffer """ @@ -155,7 +155,7 @@ def define_constant( """Defines a scalar value into the MPSGraph serialization schema Args: - tensor (torch.fx.Node): EdgeIR tensor to define into mps_graph + constant_tensor (torch.fx.Node): EdgeIR tensor to define into mps_graph mps_graph (MPSGraph): MPSGraph object for serializing into flatbuffer """ constant_tensor = constant_tensor.contiguous() @@ -191,7 +191,6 @@ def define_scalar( """Defines a 
scalar value into the MPSGraph serialization schema Args: - tensor (torch.fx.Node): EdgeIR tensor to define into mps_graph mps_graph (MPSGraph): MPSGraph object for serializing into flatbuffer """ assert isinstance(val, int) or isinstance(val, float) @@ -229,7 +228,7 @@ def get_serialized_buffer( index of its placement in the constant buffer Args: - tensor (torch.fx.Node): _description_ + node (torch.fx.Node): _description_ mps_graph (MPSGraph): _description_ Returns: @@ -299,7 +298,7 @@ def get_serialized_id( the existent id. Args: - tensor (Union[torch.fx.Node, float]): _description_ + node (Union[torch.fx.Node, float]): _description_ mps_graph (MPSGraph): _description_ Returns: diff --git a/backends/apple/mps/runtime/MPSBackend.mm b/backends/apple/mps/runtime/MPSBackend.mm index b94bdc9319b..cb96edbeb2e 100644 --- a/backends/apple/mps/runtime/MPSBackend.mm +++ b/backends/apple/mps/runtime/MPSBackend.mm @@ -19,7 +19,7 @@ namespace torch { namespace executor { -class MPSBackend final : public PyTorchBackendInterface { +class MPSBackend final : public ::executorch::runtime::BackendInterface { public: ~MPSBackend() = default; diff --git a/backends/apple/mps/runtime/operations/OperationUtils.mm b/backends/apple/mps/runtime/operations/OperationUtils.mm index c3c5c93362a..2336868863d 100644 --- a/backends/apple/mps/runtime/operations/OperationUtils.mm +++ b/backends/apple/mps/runtime/operations/OperationUtils.mm @@ -31,8 +31,13 @@ return MPSDataTypeFloat32; case DataType::mps_data_type_int8: return MPSDataTypeInt8; - case DataType::mps_data_type_int4: - return MPSDataTypeInt4; + case DataType::mps_data_type_int4: { + if (@available(macOS 15.0, iOS 18.0, tvOS 18.0, *)) { + return MPSDataTypeInt4; + } else { + return ((MPSDataType)(MPSDataTypeSignedBit | 4)); + } + } case DataType::mps_data_type_int16: return MPSDataTypeInt16; case DataType::mps_data_type_int32: diff --git a/backends/apple/mps/runtime/operations/QuantDequant.mm b/backends/apple/mps/runtime/operations/QuantDequant.mm index 7818bab2565..c37282f79a1 100644 --- a/backends/apple/mps/runtime/operations/QuantDequant.mm +++ b/backends/apple/mps/runtime/operations/QuantDequant.mm @@ -30,17 +30,19 @@ MPSGraphTensor* inputTensor = getMPSGraphTensor(graphNode->input1_id()); MPSGraphTensor* scalesTensor = getMPSGraphTensor(graphNode->scales_id()); - - MPSGraphTensor *zpTensor = [_mpsGraph constantWithScalar:0 + if (@available(macOS 15.0, iOS 18.0, tvOS 18.0, *)) { + MPSGraphTensor *zpTensor = [_mpsGraph constantWithScalar:0 dataType:MPSDataTypeInt4]; + MPSGraphTensor *wDqTensor = [_mpsGraph dequantizeTensor:inputTensor + scaleTensor:scalesTensor + zeroPointTensor:zpTensor + dataType:MPSDataTypeFloat16 + name:nil]; + _idToMPSGraphTensor[graphNode->output_id()] = wDqTensor; + } else { + _idToMPSGraphTensor[graphNode->output_id()] = nil; + } - MPSGraphTensor *wDqTensor = [_mpsGraph dequantizeTensor:inputTensor - scaleTensor:scalesTensor - zeroPointTensor:zpTensor - dataType:MPSDataTypeFloat16 - name:nil]; - - _idToMPSGraphTensor[graphNode->output_id()] = wDqTensor; return Error::Ok; } diff --git a/backends/apple/mps/targets.bzl b/backends/apple/mps/targets.bzl index 8b9c64e143c..74d79448362 100644 --- a/backends/apple/mps/targets.bzl +++ b/backends/apple/mps/targets.bzl @@ -47,7 +47,7 @@ def define_common_targets(is_xplat = False, platforms = []): "//executorch/exir/backend:backend_lib", "//executorch/extension/pybindings/...", "//executorch/runtime/backend/...", - "//executorch/sdk/runners/...", + "//executorch/devtools/runners/...", 
"//executorch/test/...", "@EXECUTORCH_CLIENTS", ], diff --git a/backends/apple/mps/test/test_mps_utils.py b/backends/apple/mps/test/test_mps_utils.py index d7efe8bde41..6f7d00d7b09 100644 --- a/backends/apple/mps/test/test_mps_utils.py +++ b/backends/apple/mps/test/test_mps_utils.py @@ -12,16 +12,16 @@ import torch from executorch.backends.apple.mps import MPSBackend from executorch.backends.apple.mps.partition import MPSPartitioner +from executorch.devtools import BundledProgram +from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite +from executorch.devtools.bundled_program.serialize import ( + serialize_from_bundled_program_to_flatbuffer, +) from executorch.exir import EdgeCompileConfig, ExirExportedProgram, to_edge from executorch.exir.backend.backend_api import to_backend from executorch.exir.backend.backend_details import CompileSpec from executorch.exir.capture._config import ExecutorchBackendConfig from executorch.extension.export_util.utils import export_to_edge -from executorch.sdk import BundledProgram -from executorch.sdk.bundled_program.config import MethodTestCase, MethodTestSuite -from executorch.sdk.bundled_program.serialize import ( - serialize_from_bundled_program_to_flatbuffer, -) from torch.export import export # Config for Capturing the weights, will be moved in the future @@ -229,7 +229,7 @@ def lower_module_and_test_output( compile_specs = [CompileSpec("use_fp16", bytes([use_fp16]))] if use_partitioner: - logging.info(f"Edge IR graph:\n{edge_program.exported_program().graph}") + logging.info(f"Edge IR graph:\n{edge_program.exported_program()}") delegated_program = edge_program delegated_program = edge_program.to_backend( MPSPartitioner(compile_specs=compile_specs) @@ -239,9 +239,7 @@ def lower_module_and_test_output( ) executorch_program = delegated_program.to_executorch( - config=ExecutorchBackendConfig( - extract_delegate_segments=False, extract_constant_segment=False - ) + config=ExecutorchBackendConfig(extract_delegate_segments=False) ) else: delegated_program = to_backend( @@ -258,9 +256,7 @@ def lower_module_and_test_output( _skip_dim_order=True, # TODO(T182928844): Delegate dim order op to backend. ), ).to_executorch( - config=ExecutorchBackendConfig( - extract_delegate_segments=False, extract_constant_segment=False - ) + config=ExecutorchBackendConfig(extract_delegate_segments=False) ) if bundled_program: diff --git a/backends/apple/mps/utils/mps_utils.py b/backends/apple/mps/utils/mps_utils.py index b6ba215534d..c31ebba0e46 100644 --- a/backends/apple/mps/utils/mps_utils.py +++ b/backends/apple/mps/utils/mps_utils.py @@ -73,7 +73,7 @@ def is_parameter(exp_prog: torch.export.ExportedProgram, node: torch.fx.Node) -> are supplied as inputs to the graph. Args: - edge_program (torch.export.ExportedProgram): _description_ + exp_prog (torch.export.ExportedProgram): _description_ node (torch.fx.Node): _description_ Returns: diff --git a/backends/arm/README.md b/backends/arm/README.md index 7167aa853b6..6f4642f8d44 100644 --- a/backends/arm/README.md +++ b/backends/arm/README.md @@ -9,7 +9,7 @@ The expected flow is: * torch.nn.module -> TOSA -> command_stream for fully AoT flows e.g. embedded. * torch.nn.module -> TOSA for flows supporting a JiT compilation step. -Current backend support is being developed for TOSA to Ethos(TM)-U55/65 via the +Current backend support is being developed for TOSA to Ethos(TM)-U55/65/85 via the ethos-u-vela compilation stack. which follows the fully AoT flow. 
## Layout @@ -33,7 +33,7 @@ Quantization: - `arm_quantizer_utils.py` - Utilities for quantization Runtime: -- `runtime/ArmBackendEthosU.cpp` - The Arm backend implementation of the ExecuTorch runtime backend (PyTorchBackendInterface) for Ethos-U +- `runtime/ArmBackendEthosU.cpp` - The Arm backend implementation of the ExecuTorch runtime backend (BackendInterface) for Ethos-U Other: - `third-party/` - Dependencies on other code - in particular the TOSA serialization_lib for compiling to TOSA and the ethos-u-core-driver for the bare-metal backend supporting Ethos-U diff --git a/backends/arm/TARGETS b/backends/arm/TARGETS new file mode 100644 index 00000000000..220db373710 --- /dev/null +++ b/backends/arm/TARGETS @@ -0,0 +1,83 @@ +load("@fbcode_macros//build_defs:python_library.bzl", "python_library") + +python_library( + name = "arm_partitioner", + srcs = [ + "arm_partitioner.py", + ], + typing = True, + deps = [ + ":arm_backend", + "//executorch/backends/arm/passes:passes", + "//executorch/exir:lib", + ], +) + +python_library( + name = "arm_backend", + srcs = [ + "arm_backend.py", + ], + typing = True, + deps = [ + "fbsource//third-party/pypi/flatbuffers:flatbuffers", + "fbsource//third-party/pypi/ml-dtypes:ml-dtypes", + "fbsource//third-party/serialization_lib/python/serializer:serializer", + "fbsource//third-party/serialization_lib/python/tosa:tosa", + ":arm_vela", + "//executorch/backends/arm/operators:lib", + "//executorch/backends/arm/operators:node_visitor", + "//executorch/backends/arm/passes:passes", + ], +) + +python_library( + name = "arm_vela", + srcs = [ + "arm_vela.py", + ], + typing = True, + deps = [ + "fbsource//third-party/pypi/ethos-u-vela:ethos-u-vela", + ], +) + +python_library( + name = "tosa_mapping", + srcs = [ + "tosa_mapping.py", + ], + typing = True, + deps = [ + "fbsource//third-party/serialization_lib/python/serializer:serializer", + "//caffe2:torch", + ], +) + +python_library( + name = "tosa_quant_utils", + srcs = [ + "tosa_quant_utils.py", + ], + typing = True, + deps = [ + "fbsource//third-party/pypi/numpy:numpy", + "fbsource//third-party/serialization_lib/python/serializer:serializer", + "fbsource//third-party/serialization_lib/python/tosa:tosa", + ":tosa_mapping", + "//executorch/exir/dialects:lib", + ], +) + +python_library( + name = "tosa_utils", + srcs = [ + "tosa_utils.py", + ], + typing = True, + deps = [ + "fbsource//third-party/serialization_lib/python/serializer:serializer", + ":tosa_quant_utils", + "//executorch/backends/arm/operators:node_visitor", + ], +) diff --git a/backends/arm/arm_backend.py b/backends/arm/arm_backend.py index f187191fee0..b83280763c2 100644 --- a/backends/arm/arm_backend.py +++ b/backends/arm/arm_backend.py @@ -3,6 +3,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + # # Main implementation of AoT flow to partition and preprocess for Arm target # backends. 
Converts via TOSA as an intermediate form supported by AoT and @@ -50,11 +52,11 @@ def __init__(self): def ethosu_compile_spec( self, config: str, - system_config: Optional[str] = None, - memory_mode: Optional[str] = None, + system_config: str, + memory_mode: str, extra_flags: Optional[str] = None, config_ini: Optional[str] = "Arm/vela.ini", - ): + ) -> "ArmCompileSpecBuilder": """ Generate compile spec for Ethos-U NPU @@ -84,7 +86,7 @@ def ethosu_compile_spec( return self - def tosa_compile_spec(self): + def tosa_compile_spec(self) -> "ArmCompileSpecBuilder": """ Generate compile spec for TOSA flatbuffer output """ @@ -94,14 +96,18 @@ def tosa_compile_spec(self): self.output_format = "tosa" return self - def dump_intermediate_artifacts_to(self, output_path: str): + def dump_intermediate_artifacts_to( + self, output_path: str + ) -> "ArmCompileSpecBuilder": """ Sets a path for dumping intermediate results during such as tosa and pte. """ self.path_for_intermediates = output_path return self - def set_permute_memory_format(self, set_nhwc_permutation: bool = True): + def set_permute_memory_format( + self, set_nhwc_permutation: bool = True + ) -> "ArmCompileSpecBuilder": """ Permute to channel last in compiler and runtime. Compilation and runtime will convert rank 4 inputs to channel last for each sub-graph. @@ -109,7 +115,7 @@ def set_permute_memory_format(self, set_nhwc_permutation: bool = True): self.permute_nhwc = set_nhwc_permutation return self - def set_quantize_io(self, quantize_io: bool = False): + def set_quantize_io(self, quantize_io: bool = False) -> "ArmCompileSpecBuilder": """ Quantization of inputs and dequantization of outputs for cases where whole graph is quantized and method signature is not of quantized type. @@ -117,7 +123,7 @@ def set_quantize_io(self, quantize_io: bool = False): self.quantize_io = quantize_io return self - def build(self): + def build(self) -> List[CompileSpec]: """ Generate a list of compile spec objects from the builder """ @@ -159,13 +165,24 @@ def is_tosa(compile_spec: List[CompileSpec]) -> bool: return False -def get_intermediate_path(compile_spec: List[CompileSpec]) -> str: +def get_intermediate_path(compile_spec: List[CompileSpec]) -> Optional[str]: for spec in compile_spec: if spec.key == "debug_artifact_path": return spec.value.decode() return None +def _get_first_delegation_tag(graph_module) -> str | None: + """Get the first delegation tag from the graph_module or return None.""" + for node in graph_module.graph.nodes: + tag = node.meta.get("delegation_tag") + if tag: + return tag + + logger.debug("No delegation tag found in partition.") + return None + + @final class ArmBackend(BackendDetails): @staticmethod @@ -220,8 +237,13 @@ def preprocess( # noqa: C901 # TODO: It would be awesome if this dump could somehow be done on top level and not here. # Problem is that the desc.json has to be created on the tosa_graph object, which we can't # access from top level. - if artifact_path is not None: - dbg_tosa_dump(tosa_graph, artifact_path) + if artifact_path: + tag = _get_first_delegation_tag(graph_module) + dbg_tosa_dump( + tosa_graph, + artifact_path, + suffix="{}".format(f"_{tag}" if tag else ""), + ) # Serialize and return the program. 
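The return-type annotations added to ArmCompileSpecBuilder make its fluent chain explicit; a sketch of how a compile spec list would be assembled with it (the Ethos-U config strings below are illustrative placeholders, not values taken from this patch):

from executorch.backends.arm.arm_backend import ArmCompileSpecBuilder

compile_spec = (
    ArmCompileSpecBuilder()
    .ethosu_compile_spec(
        "ethos-u55-128",                               # illustrative target config
        system_config="Ethos_U55_High_End_Embedded",   # illustrative
        memory_mode="Shared_Sram",                     # illustrative
    )
    .set_permute_memory_format(True)
    .set_quantize_io(True)
    .dump_intermediate_artifacts_to("/tmp/arm_artifacts")
    .build()
)
# `compile_spec` is the List[CompileSpec] handed to the Arm partitioner/backend.
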
While we have always produced TOSA # output as an intermediate, some flows compile to device binaries in diff --git a/backends/arm/arm_partitioner.py b/backends/arm/arm_partitioner.py index f73d97480bc..6b57c3d9658 100644 --- a/backends/arm/arm_partitioner.py +++ b/backends/arm/arm_partitioner.py @@ -3,6 +3,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import logging import operator import os @@ -39,10 +41,14 @@ def is_node_supported(self, submodules, node: torch.fx.Node) -> bool: exir_ops.edge.aten.add.Tensor, exir_ops.edge.aten.addmm.default, exir_ops.edge.aten.expand_copy.default, + exir_ops.edge.aten.cat.default, + exir_ops.edge.aten.bmm.default, exir_ops.edge.aten.permute_copy.default, exir_ops.edge.aten.hardtanh.default, exir_ops.edge.aten.convolution.default, exir_ops.edge.aten.div.Tensor, + exir_ops.edge.aten.exp.default, + exir_ops.edge.aten.log.default, exir_ops.edge.aten.split_with_sizes_copy.default, exir_ops.edge.aten.full.default, exir_ops.edge.aten.mul.Tensor, @@ -51,12 +57,14 @@ def is_node_supported(self, submodules, node: torch.fx.Node) -> bool: exir_ops.edge.aten.sigmoid.default, exir_ops.edge.aten.mm.default, exir_ops.edge.aten.repeat.default, + exir_ops.edge.aten.relu.default, exir_ops.edge.aten._softmax.default, exir_ops.edge.aten.slice_copy.Tensor, exir_ops.edge.aten.sub.Tensor, exir_ops.edge.aten.view_copy.default, exir_ops.edge.aten.clone.default, exir_ops.edge.aten.mean.dim, + exir_ops.edge.aten.unsqueeze_copy.default, operator.getitem, exir_ops.edge.quantized_decomposed.quantize_per_tensor.default, exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default, diff --git a/backends/arm/arm_vela.py b/backends/arm/arm_vela.py index f387672b7b4..01bb8bd55e5 100644 --- a/backends/arm/arm_vela.py +++ b/backends/arm/arm_vela.py @@ -3,14 +3,16 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+# pyre-unsafe + import os import struct -import subprocess import tempfile from typing import List import numpy as np +from ethosu.vela import vela # Pack either input or output tensor block, compose the related arrays into @@ -38,21 +40,22 @@ def vela_compile(tosa_graph, args: List[str]): with tempfile.TemporaryDirectory() as tmpdir: tosaname = "out.tosa" flatbuffer = tosa_graph.serialize() - with open(os.path.join(tmpdir, tosaname), "wb") as f: + tosa_path = os.path.join(tmpdir, tosaname) + with open(tosa_path, "wb") as f: f.write(flatbuffer) # invoke vela - vela_command = f"cd {tmpdir}; vela {' '.join(args)} {tosaname}" - try: - subprocess.run([vela_command], shell=True, check=True, capture_output=True) - except subprocess.CalledProcessError as process_error: - raise RuntimeError( - f"Vela compiler ('{vela_command}') failed with error:\n \ - {process_error.stderr.decode()}\n \ - Stdout:\n{process_error.stdout.decode()}" - ) - - np_path = os.path.join(tmpdir, "output", "out_sg0_vela.npz") + output_dir = os.path.join(tmpdir, "output") + args.append(f"--output-dir={output_dir}") + args.append(tosa_path) + vela.main(" ".join(args).split(" ")) + + if any("ethos-u85" in arg for arg in args) or any( + "debug-force-regor" in arg for arg in args + ): + np_path = os.path.join(tmpdir, "output", "out_vela.npz") + else: + np_path = os.path.join(tmpdir, "output", "out_sg0_vela.npz") blocks = b"" with np.load(np_path, allow_pickle=False) as data: diff --git a/backends/arm/operators/TARGETS b/backends/arm/operators/TARGETS new file mode 100644 index 00000000000..fd04d5fb847 --- /dev/null +++ b/backends/arm/operators/TARGETS @@ -0,0 +1,34 @@ +load("@fbcode_macros//build_defs:python_library.bzl", "python_library") + +python_library( + name = "node_visitor", + srcs = ["node_visitor.py"], + typing = True, + deps = [ + "//executorch/backends/arm:tosa_mapping", + ], +) + +python_library( + name = "ops", + srcs = glob(["op_*.py"]), + typing = True, + deps = [ + "fbsource//third-party/serialization_lib/python/tosa:tosa", + ":node_visitor", + "//executorch/backends/arm:tosa_mapping", + "//executorch/backends/arm:tosa_quant_utils", + "//executorch/backends/arm:tosa_utils", + "//executorch/exir:lib", + ], +) + +python_library( + name = "lib", + srcs = ["__init__.py"], + typing = True, + deps = [ + ":node_visitor", + ":ops", + ], +) diff --git a/backends/arm/operators/__init__.py b/backends/arm/operators/__init__.py index 94a16d8c941..7b94bfa837d 100644 --- a/backends/arm/operators/__init__.py +++ b/backends/arm/operators/__init__.py @@ -3,27 +3,35 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + from . import ( # noqa node_visitor, op_add, op_addmm, op_avg_pool2d, op_batch_norm, + op_bmm, + op_cat, op_conv2d, op_dequant, op_div, + op_exp, op_full, op_get_item, op_hardtanh, + op_log, op_mean_dim, op_mm, op_mul, op_permute, op_quant, + op_relu, op_repeat, op_sigmoid, op_slice, op_softmax, op_sub, + op_unsqueeze, op_view, ) diff --git a/backends/arm/operators/node_visitor.py b/backends/arm/operators/node_visitor.py index 59edc01e745..99fd0388e45 100644 --- a/backends/arm/operators/node_visitor.py +++ b/backends/arm/operators/node_visitor.py @@ -3,6 +3,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+# pyre-unsafe + from typing import Dict, List import serializer.tosa_serializer as ts diff --git a/backends/arm/operators/op_add.py b/backends/arm/operators/op_add.py index 33c0c49744b..ec2ade9e8ad 100644 --- a/backends/arm/operators/op_add.py +++ b/backends/arm/operators/op_add.py @@ -3,6 +3,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + from typing import List import executorch.backends.arm.tosa_quant_utils as tqutils diff --git a/backends/arm/operators/op_addmm.py b/backends/arm/operators/op_addmm.py index 4a0581376c2..b4f782db4a3 100644 --- a/backends/arm/operators/op_addmm.py +++ b/backends/arm/operators/op_addmm.py @@ -3,6 +3,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + from typing import List import serializer.tosa_serializer as ts diff --git a/backends/arm/operators/op_avg_pool2d.py b/backends/arm/operators/op_avg_pool2d.py index e6d07610c81..4caaad92028 100644 --- a/backends/arm/operators/op_avg_pool2d.py +++ b/backends/arm/operators/op_avg_pool2d.py @@ -2,6 +2,8 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. + +# pyre-unsafe from typing import List import serializer.tosa_serializer as ts diff --git a/backends/arm/operators/op_batch_norm.py b/backends/arm/operators/op_batch_norm.py index c41941722b3..d17c3a1b81f 100644 --- a/backends/arm/operators/op_batch_norm.py +++ b/backends/arm/operators/op_batch_norm.py @@ -2,6 +2,8 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. + +# pyre-unsafe from typing import List import serializer.tosa_serializer as ts diff --git a/backends/arm/operators/op_bmm.py b/backends/arm/operators/op_bmm.py new file mode 100644 index 00000000000..161b5d22396 --- /dev/null +++ b/backends/arm/operators/op_bmm.py @@ -0,0 +1,85 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-unsafe +from typing import List + +import serializer.tosa_serializer as ts +import torch.fx +from executorch.backends.arm.operators.node_visitor import ( + NodeVisitor, + register_node_visitor, +) +from executorch.backends.arm.tosa_mapping import TosaArg +from executorch.backends.arm.tosa_quant_utils import build_rescale, get_quant_node_args +from executorch.backends.arm.tosa_utils import get_two_inputs +from serializer.tosa_serializer import TosaOp + + +@register_node_visitor +class BMMVisitor(NodeVisitor): + target = "aten.bmm.default" + + def __init__(self, *args): + super().__init__(*args) + + def define_node( + self, + node: torch.fx.Node, + tosa_graph: ts.TosaSerializer, + inputs: List[TosaArg], + output: TosaArg, + is_quant_node: bool, + ) -> None: + input0, input1 = get_two_inputs(node) + + # aten.bmm maps directly to MATMUL + # NOTE: For now, only INT8 & FP32 is supported + + # For INT8, we need to get the zero points and add an intermediate tensor + # for a later rescale. 
+ if is_quant_node: + input0_zp = get_quant_node_args(input0).zp + input1_zp = get_quant_node_args(input1).zp + bmm_result = tosa_graph.addIntermediate(output.shape, ts.DType.INT32) + bmm_output_name = bmm_result.name + else: + input0_zp, input1_zp = 0, 0 + bmm_output_name = output.name + + # Add the MATMUL to the TOSA graph. + attr = ts.TosaSerializerAttribute() + attr.MatMulAttribute(A_zp=input0_zp, B_zp=input1_zp) + + tosa_graph.addOperator( + TosaOp.Op().MATMUL, + [input0.name, input1.name], + [bmm_output_name], + attr, + ) + + # As INT8 accumulates into INT32, we need to rescale it back to INT8 + if is_quant_node: + input0_q_params = get_quant_node_args(input0) + input1_q_params = get_quant_node_args(input1) + output_q_params = get_quant_node_args(list(node.users)[0]) + + final_output_scale = ( + input0_q_params.scale * input1_q_params.scale + ) / output_q_params.scale + + build_rescale( + tosa_fb=tosa_graph, + scale=final_output_scale, + # pyre-ignore[61]: Uninitialized local [61]: Local variable `bmm_result` is undefined, or not always defined. + input_node=bmm_result, + output_name=output.name, + output_type=ts.DType.INT8, + output_shape=bmm_result.shape, + input_zp=0, + output_zp=output_q_params.zp, + is_double_round=False, + ) diff --git a/backends/arm/operators/op_cat.py b/backends/arm/operators/op_cat.py new file mode 100644 index 00000000000..652eb397371 --- /dev/null +++ b/backends/arm/operators/op_cat.py @@ -0,0 +1,47 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-unsafe + +from typing import List + +import serializer.tosa_serializer as ts +from executorch.backends.arm.operators.node_visitor import ( + NodeVisitor, + register_node_visitor, +) +from executorch.backends.arm.tosa_mapping import TosaArg +from serializer.tosa_serializer import TosaOp +from torch.fx import Node + + +@register_node_visitor +class CatVisitor(NodeVisitor): + target = "aten.cat.default" + + def __init__(self, *args): + super().__init__(*args) + + def define_node( + self, + node: Node, + tosa_graph: ts.TosaSerializer, + inputs: List[TosaArg], + output: TosaArg, + is_quant_node: bool, + ) -> None: + + tensors = inputs[0].special + dim = 0 if len(inputs) < 2 else inputs[1].number + rank = len(output.shape) + dim = (dim + rank) % rank + dim = output.dim_order.index(dim) + + attr = ts.TosaSerializerAttribute() + attr.AxisAttribute(dim) + + tosa_graph.addOperator( + TosaOp.Op().CONCAT, [tensor.name for tensor in tensors], [output.name], attr + ) diff --git a/backends/arm/operators/op_conv2d.py b/backends/arm/operators/op_conv2d.py index 323b11601cb..64cde0724f5 100644 --- a/backends/arm/operators/op_conv2d.py +++ b/backends/arm/operators/op_conv2d.py @@ -2,7 +2,9 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -from typing import List + +# pyre-unsafe +from typing import cast, List import serializer.tosa_serializer as ts import torch @@ -40,7 +42,7 @@ def adjust_pad_if_needed(self, input, weight, stride, pad, dilation): if mod_remainder > pad: raise RuntimeError( - f"ignoring input element is not currently supported, got a large stride {stride}" + "This case should be handled by the SizeAdjustConv2d pass, is it enabled?" ) return pad - mod_remainder @@ -156,11 +158,12 @@ def define_node( # integer value domain of the next op. Otherwise return float32 output. 
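Both the BMM visitor above and the convolution rescale that follows map INT32 accumulators back into the INT8 output domain with the same scale algebra; a small numeric sketch with made-up quantization parameters:

# real = s0 * s1 * acc_int32   and   real = s_out * (q_out - zp_out)
# => q_out = ((s0 * s1) / s_out) * acc_int32 + zp_out
s0, s1, s_out, zp_out = 0.02, 0.05, 0.1, -3    # hypothetical quant params
acc_int32 = 1234                               # hypothetical MATMUL/conv accumulator

final_output_scale = (s0 * s1) / s_out         # 0.01, as computed in op_bmm.py
q_out = round(final_output_scale * acc_int32) + zp_out
print(q_out)                                   # 9, then clamped to int8 by the rescale op
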
if is_quant_node: # Get scale_factor from input, weight, and output. - _, input_scale, _, _, _, _ = getNodeArgs(node.args[0]) - _, weight_scale, _, _, _, _ = getNodeArgs(node.args[1]) + _, input_scale, _, _, _, _ = getNodeArgs(cast(torch.fx.Node, node.args[0])) + _, weight_scale, _, _, _, _ = getNodeArgs(cast(torch.fx.Node, node.args[1])) _, output_scale, output_zp, _, _, _ = getNodeArgs(list(node.users)[0]) build_rescale_conv_output( tosa_graph, + # pyre-fixme[61]: Uninitialized local [61]: Local variable `conv2d_res` is undefined, or not always defined. conv2d_res, output.name, actual_out_type, diff --git a/backends/arm/operators/op_dequant.py b/backends/arm/operators/op_dequant.py index 269afceccb7..afa1dda9467 100644 --- a/backends/arm/operators/op_dequant.py +++ b/backends/arm/operators/op_dequant.py @@ -2,6 +2,8 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. + +# pyre-unsafe from typing import List import serializer.tosa_serializer as ts diff --git a/backends/arm/operators/op_div.py b/backends/arm/operators/op_div.py index e365cf6cfe2..0857e0ed32a 100644 --- a/backends/arm/operators/op_div.py +++ b/backends/arm/operators/op_div.py @@ -2,6 +2,8 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. + +# pyre-unsafe from typing import List import serializer.tosa_serializer as ts diff --git a/backends/arm/operators/op_exp.py b/backends/arm/operators/op_exp.py new file mode 100644 index 00000000000..f98bb3f88c2 --- /dev/null +++ b/backends/arm/operators/op_exp.py @@ -0,0 +1,83 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-unsafe +from typing import List + +import numpy as np + +import serializer.tosa_serializer as ts +from executorch.backends.arm.operators.node_visitor import ( + NodeVisitor, + register_node_visitor, +) +from executorch.backends.arm.tosa_mapping import TosaArg + +from executorch.backends.arm.tosa_quant_utils import ( + dequantize_value, + get_quant_node_args, + QuantArgs, + quantize_value, +) +from serializer.tosa_serializer import TosaOp +from torch.fx import Node + + +@register_node_visitor +class ExpVisitor(NodeVisitor): + target = "aten.exp.default" + + def __init__(self, *args): + super().__init__(*args) + + def define_node( + self, + node: Node, + tosa_graph: ts.TosaSerializer, + inputs: List[TosaArg], + output: TosaArg, + is_quant_node: bool, + ) -> None: + + assert len(node.all_input_nodes) == 1 + assert len(node.users) == 1 + + if is_quant_node: + # Assume quantized input is 8 bit. + + # Create attribute for 8 bit table lookup. + input_node = node.all_input_nodes[0] + in_quantargs = get_quant_node_args(input_node) + output_node = list(node.users)[0] + out_quantargs = get_quant_node_args(output_node) + + table = exp_table_8bit(in_quantargs, out_quantargs) + table_attr = ts.TosaSerializerAttribute() + table_attr.TableAttribute(table) + + tosa_graph.addOperator( + TosaOp.Op().TABLE, [inputs[0].name], [output.name], table_attr + ) + else: + tosa_graph.addOperator(TosaOp.Op().EXP, [inputs[0].name], [output.name]) + + +def exp_table_8bit(in_quantargs: QuantArgs, out_quantargs: QuantArgs): + """ + Returns a table mapping 256 entries to exp([qmin,qmax]) + """ + + def exp(x): + # Convert quantized input to floating point exp input space. 
+ v = dequantize_value(x, in_quantargs) + # Compute exp. + v = np.exp(v) + # Convert exp output back to quantized space. + return quantize_value(v, out_quantargs) + + return [ + exp(x) + for x in np.linspace(in_quantargs.qmin, in_quantargs.qmax, 256, dtype=np.int8) + ] diff --git a/backends/arm/operators/op_full.py b/backends/arm/operators/op_full.py index f929b02ee67..eec27bb9090 100644 --- a/backends/arm/operators/op_full.py +++ b/backends/arm/operators/op_full.py @@ -2,6 +2,8 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. + +# pyre-unsafe from typing import List import numpy as np diff --git a/backends/arm/operators/op_get_item.py b/backends/arm/operators/op_get_item.py index 59004f49686..a696b33aa75 100644 --- a/backends/arm/operators/op_get_item.py +++ b/backends/arm/operators/op_get_item.py @@ -2,6 +2,8 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. + +# pyre-unsafe from typing import List import serializer.tosa_serializer as ts diff --git a/backends/arm/operators/op_hardtanh.py b/backends/arm/operators/op_hardtanh.py index 3d58f6d628c..62c0a27f05f 100644 --- a/backends/arm/operators/op_hardtanh.py +++ b/backends/arm/operators/op_hardtanh.py @@ -2,6 +2,8 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. + +# pyre-unsafe from typing import List import serializer.tosa_serializer as ts diff --git a/backends/arm/operators/op_log.py b/backends/arm/operators/op_log.py new file mode 100644 index 00000000000..5276173efa3 --- /dev/null +++ b/backends/arm/operators/op_log.py @@ -0,0 +1,83 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-unsafe +from typing import List + +import numpy as np + +import serializer.tosa_serializer as ts +from executorch.backends.arm.operators.node_visitor import ( + NodeVisitor, + register_node_visitor, +) +from executorch.backends.arm.tosa_mapping import TosaArg + +from executorch.backends.arm.tosa_quant_utils import ( + dequantize_value, + get_quant_node_args, + QuantArgs, + quantize_value, +) +from serializer.tosa_serializer import TosaOp +from torch.fx import Node + + +@register_node_visitor +class LogVisitor(NodeVisitor): + target = "aten.log.default" + + def __init__(self, *args): + super().__init__(*args) + + def define_node( + self, + node: Node, + tosa_graph: ts.TosaSerializer, + inputs: List[TosaArg], + output: TosaArg, + is_quant_node: bool, + ) -> None: + + assert len(node.all_input_nodes) == 1 + assert len(node.users) == 1 + + if is_quant_node: + # Assume quantized input is 8 bit. + + # Create attribute for 8 bit table lookup. 
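exp_table_8bit and the log table below follow the same recipe: dequantize each of the 256 INT8 codes, apply the float op, and requantize, producing a table the TOSA TABLE operator can index directly. A self-contained sketch of that recipe (QuantParams here is a stand-in for the backend's QuantArgs, not the real class):

from dataclasses import dataclass

import numpy as np


@dataclass
class QuantParams:  # hypothetical stand-in for tosa_quant_utils.QuantArgs
    scale: float
    zp: int
    qmin: int = -128
    qmax: int = 127


def build_8bit_table(fn, in_q: QuantParams, out_q: QuantParams) -> list:
    table = []
    for code in np.linspace(in_q.qmin, in_q.qmax, 256, dtype=np.int8):
        x = (int(code) - in_q.zp) * in_q.scale      # dequantize the INT8 code
        y = fn(x)                                   # float op, e.g. np.exp
        q = round(y / out_q.scale) + out_q.zp       # requantize into the output domain
        table.append(int(np.clip(q, out_q.qmin, out_q.qmax)))
    return table


exp_table = build_8bit_table(np.exp, QuantParams(0.05, 0), QuantParams(0.1, -128))
assert len(exp_table) == 256
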
+ input_node = node.all_input_nodes[0] + in_quantargs = get_quant_node_args(input_node) + output_node = list(node.users)[0] + out_quantargs = get_quant_node_args(output_node) + + table = log_table_8bit(in_quantargs, out_quantargs) + table_attr = ts.TosaSerializerAttribute() + table_attr.TableAttribute(table) + + tosa_graph.addOperator( + TosaOp.Op().TABLE, [inputs[0].name], [output.name], table_attr + ) + else: + tosa_graph.addOperator(TosaOp.Op().LOG, [inputs[0].name], [output.name]) + + +def log_table_8bit(in_quantargs: QuantArgs, out_quantargs: QuantArgs): + """ + Returns a table mapping 256 entries to log([qmin,qmax]) + """ + + def log(x): + # Convert quantized input to floating point log input space. + v = dequantize_value(x, in_quantargs) + # Compute log. + v = np.log(v) + # Convert log output back to quantized space. + return quantize_value(v, out_quantargs) + + return [ + log(x) + for x in np.linspace(in_quantargs.qmin, in_quantargs.qmax, 256, dtype=np.int8) + ] diff --git a/backends/arm/operators/op_mean_dim.py b/backends/arm/operators/op_mean_dim.py index 20e1b2b8d76..3c9aea30856 100644 --- a/backends/arm/operators/op_mean_dim.py +++ b/backends/arm/operators/op_mean_dim.py @@ -2,6 +2,8 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. + +# pyre-unsafe from typing import List import serializer.tosa_serializer as ts @@ -11,7 +13,6 @@ register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg -from executorch.backends.arm.tosa_utils import build_avg_pool_2d_common @register_node_visitor @@ -30,29 +31,4 @@ def define_node( is_quant_node: bool, ) -> None: - input_tensor = inputs[0] - dim = node.args[1] - keep_dim = node.args[2] - - # mean.dim(-1, -2) is the same as avg_pool2d when just computing mean over HW dimensions. - # Since tosa doesn't have mean.dim operation, lowers it to average pooling instead. - if dim == [-1, -2]: - if keep_dim is True: - # Given the shape format of input is (N, C, H, W) - kernel_size = [input_tensor.shape[2], input_tensor.shape[3]] - stride = [1, 1] - padding = [0, 0, 0, 0] - - build_avg_pool_2d_common( - node, - tosa_graph, - input_tensor, - kernel_size, - stride, - padding, - is_quant_node, - output, - ) - return - raise AssertionError("unsupported") diff --git a/backends/arm/operators/op_mm.py b/backends/arm/operators/op_mm.py index f7097022f12..ebddb3a40e2 100644 --- a/backends/arm/operators/op_mm.py +++ b/backends/arm/operators/op_mm.py @@ -3,6 +3,8 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. + +# pyre-unsafe from typing import List import serializer.tosa_serializer as ts @@ -96,6 +98,7 @@ def define_node( build_rescale( tosa_fb=tosa_graph, scale=final_output_scale, + # pyre-ignore[61]: Uninitialized local [61]: Local variable `reshape_intermediate` is undefined, or not always defined. input_node=reshape_intermediate, output_name=output.name, output_type=ts.DType.INT8, diff --git a/backends/arm/operators/op_mul.py b/backends/arm/operators/op_mul.py index e9cbfcbd7cc..c152e8759ef 100644 --- a/backends/arm/operators/op_mul.py +++ b/backends/arm/operators/op_mul.py @@ -3,7 +3,9 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
-from typing import List +# pyre-unsafe + +from typing import cast, List import executorch.backends.arm.tosa_quant_utils as tqutils import executorch.backends.arm.tosa_utils as tutils @@ -35,8 +37,12 @@ def define_node( if is_quant_node: input_A = inputs[0] input_B = inputs[1] - input_A_qargs = tqutils.get_quant_node_args(node.args[0]) - input_B_qargs = tqutils.get_quant_node_args(node.args[1]) + input_A_qargs = tqutils.get_quant_node_args( + cast(torch.fx.Node, node.args[0]) + ) + input_B_qargs = tqutils.get_quant_node_args( + cast(torch.fx.Node, node.args[1]) + ) input_A.shape = tutils.tosa_shape(input_A.shape, input_A.dim_order) input_B.shape = tutils.tosa_shape(input_B.shape, input_B.dim_order) diff --git a/backends/arm/operators/op_output.py b/backends/arm/operators/op_output.py index 7d163114aa8..1b053b18edc 100644 --- a/backends/arm/operators/op_output.py +++ b/backends/arm/operators/op_output.py @@ -3,6 +3,10 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + +from typing import cast + import serializer.tosa_serializer as ts import torch @@ -11,7 +15,7 @@ def process_output( node: torch.fx.Node, tosa_graph: ts.TosaSerializer, ): - for output in node.args[0]: + for output in cast(tuple[torch.fx.Node, ...], node.args[0]): tosa_graph.addOutputTensor( tosa_graph.currRegion.currBasicBlock.tensors[output.name] ) diff --git a/backends/arm/operators/op_permute.py b/backends/arm/operators/op_permute.py index eafd6af3678..167a0c382f4 100644 --- a/backends/arm/operators/op_permute.py +++ b/backends/arm/operators/op_permute.py @@ -3,6 +3,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + from typing import List import serializer.tosa_serializer as ts diff --git a/backends/arm/operators/op_placeholder.py b/backends/arm/operators/op_placeholder.py index 0b2e65f45d0..b5dcf3f9873 100644 --- a/backends/arm/operators/op_placeholder.py +++ b/backends/arm/operators/op_placeholder.py @@ -3,9 +3,11 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+# pyre-unsafe + import numpy as np import serializer.tosa_serializer as ts -import torch +import torch.fx from executorch.backends.arm.tosa_mapping import TosaArg from executorch.backends.arm.tosa_quant_utils import ( get_quant_arg_dtype, @@ -130,6 +132,21 @@ def process_inputs_to_buffers( ) +def process_inputs_to_lifted_tensor_constants( + node: torch.fx.Node, + tosa_graph: ts.TosaSerializer, + edge_program: ExportedProgram, +): + arg = TosaArg(node) + tensor_name = edge_program.graph_signature.inputs_to_lifted_tensor_constants[ + arg.name + ] + tensor = edge_program.tensor_constants[tensor_name] + tensor_data = tensor.detach().numpy() + + tosa_graph.addConst(tensor_data.shape, arg.dtype, tensor_data, name=arg.name) + + def process_placeholder( node: torch.fx.Node, tosa_graph: ts.TosaSerializer, @@ -145,5 +162,11 @@ def process_placeholder( process_inputs_to_parameters(node, tosa_graph, edge_program) elif node.name in edge_program.graph_signature.inputs_to_buffers: process_inputs_to_buffers(node, tosa_graph, edge_program) + elif node.name in edge_program.graph_signature.inputs_to_lifted_tensor_constants: + process_inputs_to_lifted_tensor_constants(node, tosa_graph, edge_program) + elif node.name in edge_program.graph_signature.inputs_to_lifted_custom_objs: + raise NotImplementedError( + "Placeholder is of type 'lifted custom object' which is not supported." + ) else: - raise RuntimeError(f"Unknown placeholder {node.name}") + raise RuntimeError(f"Placeholder '{node.name}' is of unknown type.") diff --git a/backends/arm/operators/op_quant.py b/backends/arm/operators/op_quant.py index e6a62b3f206..8f83e79442d 100644 --- a/backends/arm/operators/op_quant.py +++ b/backends/arm/operators/op_quant.py @@ -2,6 +2,8 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. + +# pyre-unsafe from typing import List import serializer.tosa_serializer as ts diff --git a/backends/arm/operators/op_relu.py b/backends/arm/operators/op_relu.py new file mode 100644 index 00000000000..20bba3f6545 --- /dev/null +++ b/backends/arm/operators/op_relu.py @@ -0,0 +1,57 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +# pyre-unsafe + +import executorch.backends.arm.tosa_quant_utils as tqutils +import serializer.tosa_serializer as ts +import torch.fx +from executorch.backends.arm.operators.node_visitor import ( + NodeVisitor, + register_node_visitor, +) +from executorch.backends.arm.tosa_mapping import TosaArg +from serializer.tosa_serializer import TosaOp + + +@register_node_visitor +class ReluVisitor(NodeVisitor): + target = "aten.relu.default" + + def __init__(self, *args): + super().__init__(*args) + + def define_node( + self, + node: torch.fx.Node, + tosa_graph: ts.TosaSerializer, + inputs: list[TosaArg], + output: TosaArg, + is_quant_node: bool, + ) -> None: + attr = ts.TosaSerializerAttribute() + + clamp_min_fp = 0.0 + clamp_max_fp = 0.0 + clamp_min_qs = 0 + clamp_max_qs = 0 + if is_quant_node: + out_qargs = tqutils.get_quant_node_args(list(node.users)[0]) + clamp_min_qs = tqutils.quantize_value(0, out_qargs) + clamp_max_qs = tqutils.quantize_value(float("inf"), out_qargs) + + else: + clamp_min_fp = 0 + clamp_max_fp = float("inf") + + attr.ClampAttribute( + tosa_graph.builder, + clamp_min_qs, + clamp_max_qs, + clamp_min_fp, + clamp_max_fp, + ) + + tosa_graph.addOperator(TosaOp.Op().CLAMP, [inputs[0].name], [output.name], attr) diff --git a/backends/arm/operators/op_repeat.py b/backends/arm/operators/op_repeat.py index 261fcca12e7..20de9e0846a 100644 --- a/backends/arm/operators/op_repeat.py +++ b/backends/arm/operators/op_repeat.py @@ -3,6 +3,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import serializer.tosa_serializer as ts import torch from executorch.backends.arm.operators.node_visitor import ( diff --git a/backends/arm/operators/op_sigmoid.py b/backends/arm/operators/op_sigmoid.py index 884c803482b..0087b1f7a81 100644 --- a/backends/arm/operators/op_sigmoid.py +++ b/backends/arm/operators/op_sigmoid.py @@ -2,6 +2,8 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. + +# pyre-unsafe from typing import List import numpy as np diff --git a/backends/arm/operators/op_slice.py b/backends/arm/operators/op_slice.py index e562e0724e2..0dfb287cd75 100644 --- a/backends/arm/operators/op_slice.py +++ b/backends/arm/operators/op_slice.py @@ -3,6 +3,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + from typing import List import serializer.tosa_serializer as ts diff --git a/backends/arm/operators/op_softmax.py b/backends/arm/operators/op_softmax.py index 627fa64aed1..1ac42413189 100644 --- a/backends/arm/operators/op_softmax.py +++ b/backends/arm/operators/op_softmax.py @@ -2,6 +2,8 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
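In the quantized path of the ReLU visitor above, the op degenerates to a CLAMP whose bounds are the quantized encodings of 0.0 and +inf; a tiny numeric sketch with hypothetical output quantization parameters:

scale, zp, qmax = 0.02, -10, 127         # hypothetical INT8 output quant params

clamp_min_qs = round(0.0 / scale) + zp   # -10: the INT8 code for float 0.0 is the zero point
clamp_max_qs = qmax                      # quantizing float("inf") saturates at qmax
print(clamp_min_qs, clamp_max_qs)        # -10 127
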
+ +# pyre-unsafe from typing import List import serializer.tosa_serializer as ts @@ -33,7 +35,7 @@ def define_node( input_name = inputs[0].name dim_order = inputs[0].dim_order input_shape = tosa_shape(inputs[0].shape, dim_order) - dim_value = dim_order.index(inputs[1].number) + dim_value = dim_order.index(inputs[1].number % len(dim_order)) ## softmax = exp(logits - max(logits)) / reduce_sum(exp(logits - max(logits)), -1) # FP32 diff --git a/backends/arm/operators/op_sub.py b/backends/arm/operators/op_sub.py index 3dc1519f370..2089b6e9e96 100644 --- a/backends/arm/operators/op_sub.py +++ b/backends/arm/operators/op_sub.py @@ -3,6 +3,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + from typing import List import executorch.backends.arm.tosa_quant_utils as tqutils diff --git a/backends/arm/operators/op_unsqueeze.py b/backends/arm/operators/op_unsqueeze.py new file mode 100644 index 00000000000..c14128fdc8c --- /dev/null +++ b/backends/arm/operators/op_unsqueeze.py @@ -0,0 +1,53 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +# +# Follows this specification: https://pytorch.org/docs/stable/generated/torch.unsqueeze.html + +# pyre-unsafe + +import serializer.tosa_serializer as ts +import torch.fx +from executorch.backends.arm.operators.node_visitor import ( + NodeVisitor, + register_node_visitor, +) +from executorch.backends.arm.tosa_mapping import TosaArg +from executorch.backends.arm.tosa_utils import tosa_shape +from serializer.tosa_serializer import TosaOp + + +@register_node_visitor +class UnsqueezeVisitor(NodeVisitor): + target = "aten.unsqueeze_copy.default" + + def __init__(self, *args): + super().__init__(*args) + + def define_node( + self, + node: torch.fx.Node, + tosa_graph: ts.TosaSerializer, + inputs: list[TosaArg], + output: TosaArg, + is_quant_node: bool, + ) -> None: + + dim = inputs[1].number + shape = inputs[0].shape + rank = len(shape) + + assert -rank - 1 <= dim < rank + 1 + if dim < 0: + dim = dim + rank + 1 + + new_shape = list(shape) + new_shape.insert(dim, 1) + new_shape = tosa_shape(new_shape, output.dim_order) + + attr = ts.TosaSerializerAttribute() + attr.ReshapeAttribute(new_shape) + tosa_graph.addOperator( + TosaOp.Op().RESHAPE, [inputs[0].name], [output.name], attr + ) diff --git a/backends/arm/operators/op_view.py b/backends/arm/operators/op_view.py index 682eacd5e38..8667df590dc 100644 --- a/backends/arm/operators/op_view.py +++ b/backends/arm/operators/op_view.py @@ -2,10 +2,13 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
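Two of the changes above normalize negative dimension indices: the softmax fix wraps dim modulo the rank before the dim_order lookup, and the new unsqueeze visitor accepts insertion positions in [-rank-1, rank]. In plain Python:

# softmax: wrap a negative dim before looking it up in the annotated dim order.
dim_order = (0, 2, 3, 1)          # NHWC-annotated graph, rank 4
dim = -1                          # "last dimension" as passed by aten._softmax
print(dim_order.index(dim % len(dim_order)))   # -1 % 4 == 3 -> index 2

# unsqueeze: a new axis may go one past the end, so the wrap adds rank + 1.
shape, rank = [2, 3], 2
dim = -1
if dim < 0:
    dim = dim + rank + 1          # -1 -> 2: append a trailing axis
new_shape = list(shape)
new_shape.insert(dim, 1)
print(new_shape)                  # [2, 3, 1]
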
+ +# pyre-unsafe from typing import List import serializer.tosa_serializer as ts import torch +import tosa.Op as TosaOp from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, @@ -13,7 +16,6 @@ ) from executorch.backends.arm.tosa_mapping import TosaArg from executorch.backends.arm.tosa_utils import tosa_shape -from serializer.tosa_serializer import TosaOp @register_node_visitor diff --git a/backends/arm/passes/TARGETS b/backends/arm/passes/TARGETS new file mode 100644 index 00000000000..ca20b03fccd --- /dev/null +++ b/backends/arm/passes/TARGETS @@ -0,0 +1,12 @@ +load("@fbcode_macros//build_defs:python_library.bzl", "python_library") + +python_library( + name = "passes", + srcs = glob(["*.py"]), + typing = True, + deps = [ + "//executorch/backends/arm:tosa_quant_utils", + "//executorch/backends/arm:tosa_utils", + "//executorch/exir:lib", + ], +) diff --git a/backends/arm/passes/annotate_channels_last_dim_order_pass.py b/backends/arm/passes/annotate_channels_last_dim_order_pass.py index 9bb45c504a4..222c0a7cb36 100644 --- a/backends/arm/passes/annotate_channels_last_dim_order_pass.py +++ b/backends/arm/passes/annotate_channels_last_dim_order_pass.py @@ -4,6 +4,10 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + +from typing import cast + import torch from executorch.backends.arm.tosa_quant_utils import dq_op from executorch.backends.arm.tosa_utils import is_consumer_node_depthwise_conv2d @@ -28,9 +32,11 @@ def is_weight_node_for_depthwise_conv2d(self, node: torch.fx.Node): if node.target != dq_op: return False prev_node = node.args[0] - if prev_node.op != "placeholder": + if cast(torch.fx.Node, prev_node).op != "placeholder": return False - return is_consumer_node_depthwise_conv2d(node) + if is_consumer_node_depthwise_conv2d(node): + consumer_node = list(node.users)[0] + return consumer_node.args[1] == node elif node.op == "placeholder": # node is an input, weight or bias node consumer_node = list(node.users)[0] @@ -46,7 +52,9 @@ def call(self, graph_module: torch.fx.GraphModule): NHWC_Order = (0, 2, 3, 1) HWCM_Order = (2, 3, 0, 1) for node in graph_module.graph.nodes: - if isinstance(node.meta["val"], tuple): + if isinstance( + node.meta["val"], (tuple, torch.fx.immutable_collections.immutable_list) + ): node_data = node.meta["val"][0].data else: node_data = node.meta["val"].data diff --git a/backends/arm/passes/arm_pass_manager.py b/backends/arm/passes/arm_pass_manager.py index 054d823dbbb..75ef551171e 100644 --- a/backends/arm/passes/arm_pass_manager.py +++ b/backends/arm/passes/arm_pass_manager.py @@ -5,6 +5,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
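The AnnotateChannelsLastDimOrder pass above tags rank-4 tensors with the NHWC dim order (0, 2, 3, 1); downstream visitors use that order to permute shapes for TOSA, roughly as follows (standalone illustration, not the backend's tosa_shape helper):

nchw_shape = (1, 8, 16, 16)            # (N, C, H, W) as seen in the exported graph
nhwc_order = (0, 2, 3, 1)              # NHWC_Order from the pass above
tosa_side_shape = tuple(nchw_shape[d] for d in nhwc_order)
print(tosa_side_shape)                 # (1, 16, 16, 8) -- channels last
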
+# pyre-unsafe + import torch from executorch.backends.arm.passes.annotate_channels_last_dim_order_pass import ( AnnotateChannelsLastDimOrder, @@ -15,22 +17,28 @@ from executorch.backends.arm.passes.convert_split_to_slice import ( ConvertSplitToSlicePass, ) +from executorch.backends.arm.passes.meandim_to_averagepool_pass import ( + ConvertMeanDimToAveragePool, +) from executorch.backends.arm.passes.remove_clone_pass import RemoveClonePass +from executorch.backends.arm.passes.size_adjust_conv2d_pass import SizeAdjustConv2DPass from executorch.exir.backend.compile_spec_schema import CompileSpec from executorch.exir.pass_manager import PassManager class ArmPassManager(PassManager): - def _transform(self, graph_module: torch.fx.Graph): + def _transform(self, graph_module: torch.fx.GraphModule): return self(graph_module).graph_module def transform_to_backend_pipeline( - self, graph_module: torch.fx.Graph, compile_spec: CompileSpec + self, graph_module: torch.fx.GraphModule, compile_spec: list[CompileSpec] ): """Apply passes before transforming program to backend""" + self.add_pass(SizeAdjustConv2DPass()) self.add_pass(RemoveClonePass()) self.add_pass(ConvertExpandCopyToRepeatPass()) + self.add_pass(ConvertMeanDimToAveragePool()) self.add_pass(ConvertSplitToSlicePass()) for spec in compile_spec: if spec.key == "permute_memory_format": diff --git a/backends/arm/passes/convert_expand_copy_to_repeat.py b/backends/arm/passes/convert_expand_copy_to_repeat.py index 53138682d56..249c014ae67 100644 --- a/backends/arm/passes/convert_expand_copy_to_repeat.py +++ b/backends/arm/passes/convert_expand_copy_to_repeat.py @@ -4,6 +4,10 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + +from typing import cast + import torch.fx from executorch.backends.arm.tosa_mapping import extract_tensor_meta from executorch.exir.dialects._ops import ops as exir_ops @@ -31,7 +35,7 @@ def call(self, graph_module: torch.fx.GraphModule): expand_node = src_partition.nodes[0] _, shape, _ = extract_tensor_meta(expand_node.all_input_nodes[0].meta) - multiples = expand_node.args[1] + multiples = cast(tuple[int], expand_node.args[1]) expanded_rank = len(multiples) # Expanded shape is 'shape' front-padded with ones. diff --git a/backends/arm/passes/convert_split_to_slice.py b/backends/arm/passes/convert_split_to_slice.py index ff978d4d9ec..29aae37fe9e 100644 --- a/backends/arm/passes/convert_split_to_slice.py +++ b/backends/arm/passes/convert_split_to_slice.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import torch.fx from executorch.backends.arm.tosa_mapping import extract_tensor_meta from executorch.exir.dialects._ops import ops as exir_ops diff --git a/backends/arm/passes/meandim_to_averagepool_pass.py b/backends/arm/passes/meandim_to_averagepool_pass.py new file mode 100644 index 00000000000..0974eac740c --- /dev/null +++ b/backends/arm/passes/meandim_to_averagepool_pass.py @@ -0,0 +1,54 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +# pyre-unsafe + +from typing import Any, cast, Dict, Tuple + +import torch.fx + +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass, NodeMetadata, ProxyValue + +Argument = Any + + +class ConvertMeanDimToAveragePool(ExportPass): + """ + Replace a mean operation with dim = [-1, -2] and keep_dim = True with an average pool operation. + """ + + def call_operator( + self, + op: torch.fx.node.Target, + args: Tuple[Argument, ...], + kwargs: Dict[str, Argument], + meta: NodeMetadata, + ) -> ProxyValue: + if op != exir_ops.edge.aten.mean.dim: + return super().call_operator(op, args, kwargs, meta) + + input_value = cast(ProxyValue, args[0]) + dim = cast(list, args[1]) + keep_dim = cast(bool, args[2]) if len(args) > 2 else False + + # averagepool2d gets converted to a mean operation with dim = [-1, -2] and keep_dim = True + # so check the dim argument for this case + if dim == [-1, -2] and keep_dim is True: + # Given the shape format of input is (N, C, H, W) + kernel_size = [ + input_value.to_tensor().size()[2], + input_value.to_tensor().size()[3], + ] + stride = [1, 1] + return super().call_operator( + exir_ops.edge.aten.avg_pool2d.default, + (input_value, kernel_size, stride), + {}, + meta, + ) + else: + return super().call_operator(op, args, kwargs, meta) diff --git a/backends/arm/passes/remove_clone_pass.py b/backends/arm/passes/remove_clone_pass.py index 6108080cb0d..64a1ae8f43e 100644 --- a/backends/arm/passes/remove_clone_pass.py +++ b/backends/arm/passes/remove_clone_pass.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import torch from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass, PassResult diff --git a/backends/arm/passes/size_adjust_conv2d_pass.py b/backends/arm/passes/size_adjust_conv2d_pass.py new file mode 100644 index 00000000000..980ab09e597 --- /dev/null +++ b/backends/arm/passes/size_adjust_conv2d_pass.py @@ -0,0 +1,131 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
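ConvertMeanDimToAveragePool relies on the fact that a mean over the last two dims with keep_dim=True is exactly a full-window average pool on an (N, C, H, W) tensor; a quick eager-mode check of that equivalence:

import torch
import torch.nn.functional as F

x = torch.randn(1, 3, 8, 8)
mean = x.mean(dim=[-1, -2], keepdim=True)                       # (1, 3, 1, 1)
pooled = F.avg_pool2d(x, kernel_size=(x.shape[2], x.shape[3]))  # full-window pool
assert torch.allclose(mean, pooled, atol=1e-6)
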
+ +# pyre-unsafe + +from typing import cast, Optional + +import torch.fx +from executorch.backends.arm.tosa_quant_utils import is_quant_node +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass, PassResult +from torch._ops import OpOverload + + +def conv_remainder(input_length, pad, dilation, weight, stride): + """ + Returns the size + """ + return (input_length + 2 * pad - dilation * (weight - 1) - 1) % stride + + +def insert_q_dq_pair( + graph: torch.fx.Graph, + anchor: torch.fx.Node, + q_params: tuple, +): + with graph.inserting_after(anchor): + q = create_node( + graph=graph, + op_target=exir_ops.edge.quantized_decomposed.quantize_per_tensor.default, + args=(), # We add the argument last + ) + q.meta = anchor.meta + + with graph.inserting_after(q): + dq = create_node( + graph=graph, + op_target=exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default, + args=(q,) + q_params, + ) + dq.meta = q.meta + + anchor.replace_all_uses_with(dq) + # We add this last so the replace all uses above does not replace the quantized + # node's first use + q.args = (anchor,) + q_params + return dq + + +def create_node( + graph: torch.fx.Graph, + op_target: OpOverload, + args: tuple = (), + kwargs: Optional[dict] = None, +): + return graph.create_node( + "call_function", + op_target, + args=args, + kwargs=kwargs or {}, + ) + + +class SizeAdjustConv2DPass(ExportPass): + """ + Adjust the convolution input size to match perfectly with the + weight size, padding, stride and dilation parameters. + This is done by inserting a slice op to remove the uneven end of the input. + """ + + conv2d_op = exir_ops.edge.aten.convolution.default + slice_op = exir_ops.edge.aten.slice_copy.Tensor + + def call(self, graph_module: torch.fx.GraphModule): + graph = graph_module.graph + modified_graph = False + for node in graph.nodes: + if node.op != "call_function": + continue + if node.target != self.conv2d_op: + continue + + conv_node = cast(torch.fx.Node, node) + input_node, weight, _, stride_hw, pad_hw, dilation_hw, _, _, _ = ( + conv_node.args + ) + weight_shape = cast(torch.fx.Node, weight).meta["val"].shape + input_shape = cast(torch.fx.Node, input_node).meta["val"].shape + + slice_args = [] + for stride, pad, dilation, dim in zip( + cast(list, stride_hw), + cast(list, pad_hw), + cast(list, dilation_hw), + (2, 3), + ): + remainder = conv_remainder( + input_shape[dim], pad, dilation, weight_shape[dim], stride + ) + if remainder > pad: + adjustment = remainder - pad + args = (dim, 0, input_shape[dim] - adjustment) + slice_args.append(args) + if len(slice_args) == 0: + continue + + with graph_module.graph.inserting_before(node): + last_node = cast(torch.fx.Node, input_node) + for args in slice_args: + slice_node = graph.create_node( + "call_function", self.slice_op, (last_node,) + args + ) + if is_quant_node(last_node): + q_params = last_node.args[1:] + dq_node = insert_q_dq_pair( + graph_module.graph, slice_node, q_params + ) + last_node = dq_node + else: + last_node = slice_node + conv_node.replace_input_with(cast(torch.fx.Node, input_node), last_node) + modified_graph = True + + if modified_graph: + graph_module = super().call(graph_module).graph_module + graph.eliminate_dead_code() + graph_module.recompile() + return PassResult(graph_module, True) diff --git a/backends/arm/passes/tag_io_quant_pass.py b/backends/arm/passes/tag_io_quant_pass.py index d2bf74462ed..2fce6cf3fd4 100644 --- a/backends/arm/passes/tag_io_quant_pass.py +++ 
b/backends/arm/passes/tag_io_quant_pass.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import torch from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass, PassResult diff --git a/backends/arm/quantizer/TARGETS b/backends/arm/quantizer/TARGETS new file mode 100644 index 00000000000..840586488bf --- /dev/null +++ b/backends/arm/quantizer/TARGETS @@ -0,0 +1,31 @@ +load("@fbcode_macros//build_defs:python_library.bzl", "python_library") + +python_library( + name = "arm_quantizer", + srcs = ["arm_quantizer.py"], + typing = True, + deps = [ + ":arm_quantizer_utils", + "//caffe2:torch", + "//executorch/backends/arm/quantizer/quantization_annotation:quantization_annotation", + "//executorch/exir:lib", + ], +) + +python_library( + name = "quantization_config", + srcs = ["quantization_config.py"], + typing = True, + deps = [ + "//caffe2:torch", + ], +) + +python_library( + name = "arm_quantizer_utils", + srcs = ["arm_quantizer_utils.py"], + typing = True, + deps = [ + ":quantization_config", + ], +) diff --git a/backends/arm/quantizer/arm_quantizer.py b/backends/arm/quantizer/arm_quantizer.py index 8d5edf386a0..853fd47c29c 100644 --- a/backends/arm/quantizer/arm_quantizer.py +++ b/backends/arm/quantizer/arm_quantizer.py @@ -5,6 +5,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + # # Quantizer for Arm backend # @@ -267,6 +269,8 @@ class ArmQuantizer(Quantizer): "mul", "sigmoid", "mm", + "cat", + "one_to_one", ] def __init__(self) -> None: diff --git a/backends/arm/quantizer/arm_quantizer_utils.py b/backends/arm/quantizer/arm_quantizer_utils.py index c5da32a40ad..fe9c5e34e6b 100644 --- a/backends/arm/quantizer/arm_quantizer_utils.py +++ b/backends/arm/quantizer/arm_quantizer_utils.py @@ -5,12 +5,14 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + # # Utility functions for ArmQuantizer # import operator -from typing import Callable, cast, List +from typing import Callable, cast, List, Union import torch from executorch.backends.arm.quantizer.quantization_config import QuantizationConfig @@ -72,7 +74,7 @@ def get_shared_qspec( Both outputs are None if one of the inputs is a node that can't be quantized. """ - input_act0 = node.args[0] + input_act0 = cast(Node, node.args[0]) input_act1 = node.args[1] input_act_qspec = quantization_config.get_input_act_qspec() @@ -102,12 +104,19 @@ def is_input_ok_for_quantization(input_act: Node, gm: GraphModule): ) +def get_node_target(module: torch.nn.Module | GraphModule, target_str: str): + targets = target_str.split(".") + for target in targets[:-1]: + module = module.get_submodule(target) + return getattr(module, targets[-1]) + + def is_input_large_scalar(node: Node, gm: GraphModule): """Check if input is a large scalar value. 
So that we can skip quantization for the node since histc op (in HistogramObserver) only works for values up to certain upper bound """ if node.op == "get_attr" and isinstance(node.target, str): - tensor = getattr(gm, node.target) + tensor = get_node_target(gm, node.target) # torch.histc works until this upper bound HISTC_UPPER_BOUND = 3.4028235e15 return tensor.numel() == 1 and abs(tensor.item()) > HISTC_UPPER_BOUND @@ -131,6 +140,7 @@ def is_share_obs_or_fq_op(op: Callable) -> bool: return op in [ torch.ops.aten.hardtanh.default, torch.ops.aten.hardtanh_.default, + torch.ops.aten.relu.default, torch.ops.aten.mean.default, torch.ops.aten.mean.dim, torch.ops.aten.permute.default, @@ -161,7 +171,9 @@ def propagate_annotation(model: GraphModule) -> None: n = cast(Node, n) if is_annotated(n): continue - if n.op != "call_function" or not is_share_obs_or_fq_op(n.target): + if n.op != "call_function" or not is_share_obs_or_fq_op( + cast(Callable, n.target) + ): continue prev_node = n.args[0] @@ -209,7 +221,7 @@ def convert_scalars_to_attrs(model: GraphModule) -> GraphModule: prefix = "_tensor_constant_" get_new_attr_name = get_new_attr_name_with_prefix(prefix) tensor_constant_name = get_new_attr_name(model) - float_tensor = torch.tensor(float(args[i])) + float_tensor = torch.tensor(float(cast(Union[int, float], args[i]))) model.register_buffer(tensor_constant_name, float_tensor) fake_mode = n.meta["val"].fake_mode with model.graph.inserting_before(n): diff --git a/backends/arm/quantizer/quantization_annotation/TARGETS b/backends/arm/quantizer/quantization_annotation/TARGETS new file mode 100644 index 00000000000..4ce8b5cad2c --- /dev/null +++ b/backends/arm/quantizer/quantization_annotation/TARGETS @@ -0,0 +1,12 @@ +load("@fbcode_macros//build_defs:python_library.bzl", "python_library") + +python_library( + name = "quantization_annotation", + srcs = glob(["*.py"]), + typing = True, + deps = [ + "//caffe2:torch", + "//executorch/backends/arm/quantizer:arm_quantizer_utils", + "//executorch/backends/arm/quantizer:quantization_config", + ], +) diff --git a/backends/arm/quantizer/quantization_annotation/__init__.py b/backends/arm/quantizer/quantization_annotation/__init__.py index 60808d2f234..f7219201dec 100644 --- a/backends/arm/quantizer/quantization_annotation/__init__.py +++ b/backends/arm/quantizer/quantization_annotation/__init__.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + from typing import Callable, Dict, List, NamedTuple, Optional @@ -49,11 +51,13 @@ def decorator(annotator: AnnotatorType): from . import ( # noqa adaptive_ang_pool2d_annotator, add_annotator, + cat_annotator, conv_annotator, linear_annotator, max_pool2d_annotator, mm_annotator, mul_annotator, + one_to_one_annotator, sigmoid_annotator, sub_annotator, ) diff --git a/backends/arm/quantizer/quantization_annotation/adaptive_ang_pool2d_annotator.py b/backends/arm/quantizer/quantization_annotation/adaptive_ang_pool2d_annotator.py index acbdc45b6b9..723a48f6644 100644 --- a/backends/arm/quantizer/quantization_annotation/adaptive_ang_pool2d_annotator.py +++ b/backends/arm/quantizer/quantization_annotation/adaptive_ang_pool2d_annotator.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
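The get_node_target helper added above exists because a get_attr node's target can be a dotted path into submodules, which a plain getattr on the root module cannot resolve. A self-contained sketch of the same lookup; the module and buffer names are made up for illustration.

import torch

def get_node_target(module: torch.nn.Module, target_str: str):
    # Walk every intermediate submodule, then getattr the final attribute.
    *path, leaf = target_str.split(".")
    for name in path:
        module = module.get_submodule(name)
    return getattr(module, leaf)

class Inner(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.register_buffer("scale", torch.tensor(3.0))

class Outer(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.inner = Inner()

print(get_node_target(Outer(), "inner.scale"))  # tensor(3.)
# getattr(Outer(), "inner.scale") would raise AttributeError instead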
+# pyre-unsafe + import itertools from typing import Callable, List, Optional diff --git a/backends/arm/quantizer/quantization_annotation/add_annotator.py b/backends/arm/quantizer/quantization_annotation/add_annotator.py index 2926e92f243..35801bd5681 100644 --- a/backends/arm/quantizer/quantization_annotation/add_annotator.py +++ b/backends/arm/quantizer/quantization_annotation/add_annotator.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import itertools import operator from typing import Callable, List, Optional diff --git a/backends/arm/quantizer/quantization_annotation/cat_annotator.py b/backends/arm/quantizer/quantization_annotation/cat_annotator.py new file mode 100644 index 00000000000..6e138cd9def --- /dev/null +++ b/backends/arm/quantizer/quantization_annotation/cat_annotator.py @@ -0,0 +1,68 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# Copyright 2024 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-unsafe + +import itertools +from typing import Callable, cast, List, Optional + +import torch.fx +from executorch.backends.arm.quantizer import arm_quantizer_utils +from executorch.backends.arm.quantizer.quantization_annotation import register_annotator +from executorch.backends.arm.quantizer.quantization_config import QuantizationConfig +from torch.ao.quantization.quantizer import ( + QuantizationAnnotation, + SharedQuantizationSpec, +) +from torch.fx import Node +from torch.fx.passes.utils.source_matcher_utils import get_source_partitions + + +@register_annotator("cat") +def _annotate_cat( + gm: torch.fx.GraphModule, + quantization_config: QuantizationConfig, + filter_fn: Optional[Callable[[Node], bool]] = None, +) -> Optional[List[List[Node]]]: + cat_partitions = get_source_partitions(gm.graph, [torch.cat], filter_fn) + cat_partitions = list(itertools.chain.from_iterable(cat_partitions.values())) + annotated_partitions = [] + for cat_partition in cat_partitions: + annotated_partitions.append(cat_partition.nodes) + cat_node = cat_partition.output_nodes[0] + if arm_quantizer_utils.is_annotated(cat_node): + continue + + input_acts = cast(list[torch.fx.Node], cat_node.args[0]) + input_act0 = input_acts[0] + + input_act_qspec = quantization_config.get_input_act_qspec() + shared_with_input0_qspec = SharedQuantizationSpec((input_act0, cat_node)) + + input_qspec_map = {} + + # First input is set to input qspec from the quantization config. + if isinstance(input_act0, Node): + if not arm_quantizer_utils.is_input_ok_for_quantization(input_act0, gm): + continue + input_qspec_map[input_act0] = input_act_qspec + + # For the rest of the inputs, share qspec with first. + # If we can't quantize any of the inputs, abort annotation. 
+ for input_act in input_acts[1:]: + if isinstance(input_act, Node): + if not arm_quantizer_utils.is_input_ok_for_quantization(input_act, gm): + continue + if input_act is not input_act0: + input_qspec_map[input_act] = shared_with_input0_qspec + + if input_qspec_map is not None: + cat_node.meta["quantization_annotation"] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + output_qspec=shared_with_input0_qspec, + _annotated=True, + ) + return annotated_partitions diff --git a/backends/arm/quantizer/quantization_annotation/conv_annotator.py b/backends/arm/quantizer/quantization_annotation/conv_annotator.py index 40a1f1ee9ea..4ff7dd9e800 100644 --- a/backends/arm/quantizer/quantization_annotation/conv_annotator.py +++ b/backends/arm/quantizer/quantization_annotation/conv_annotator.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree.f +# pyre-unsafe + from typing import Callable, List, Optional import torch diff --git a/backends/arm/quantizer/quantization_annotation/linear_annotator.py b/backends/arm/quantizer/quantization_annotation/linear_annotator.py index 95b881a9548..7c3f91ec707 100644 --- a/backends/arm/quantizer/quantization_annotation/linear_annotator.py +++ b/backends/arm/quantizer/quantization_annotation/linear_annotator.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + from typing import Callable, List, Optional import torch diff --git a/backends/arm/quantizer/quantization_annotation/max_pool2d_annotator.py b/backends/arm/quantizer/quantization_annotation/max_pool2d_annotator.py index 3d9d8b2e6c8..0ef2ee39fe5 100644 --- a/backends/arm/quantizer/quantization_annotation/max_pool2d_annotator.py +++ b/backends/arm/quantizer/quantization_annotation/max_pool2d_annotator.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import itertools from typing import Callable, List, Optional diff --git a/backends/arm/quantizer/quantization_annotation/mm_annotator.py b/backends/arm/quantizer/quantization_annotation/mm_annotator.py index 969f0131ffd..b48c6d59905 100644 --- a/backends/arm/quantizer/quantization_annotation/mm_annotator.py +++ b/backends/arm/quantizer/quantization_annotation/mm_annotator.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
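Why _annotate_cat above shares the first input's quantization spec with the remaining inputs and the output: when every operand of a cat uses the same scale and zero-point, concatenating the raw int8 buffers already yields the quantized result, so no requantization is needed around the cat. A rough illustration with made-up quantization parameters.

import torch

scale, zero_point = 0.05, 0
a, b = torch.rand(2, 3), torch.rand(2, 3)

def quantize(t):
    return torch.clamp((t / scale).round() + zero_point, -128, 127).to(torch.int8)

q_cat = torch.cat([quantize(a), quantize(b)])          # cat runs directly on the int8 buffers
dequantized = (q_cat.to(torch.float32) - zero_point) * scale
print(torch.allclose(dequantized, torch.cat([a, b]), atol=scale / 2))  # True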
+# pyre-unsafe + import itertools from typing import Callable, List, Optional @@ -22,7 +24,7 @@ def _annotate_mm( quantization_config: QuantizationConfig, filter_fn: Optional[Callable[[Node], bool]] = None, ) -> Optional[List[List[Node]]]: - mm_partitions = get_source_partitions(gm.graph, [torch.mm], filter_fn) + mm_partitions = get_source_partitions(gm.graph, [torch.mm, torch.bmm], filter_fn) mm_partitions = list(itertools.chain.from_iterable(mm_partitions.values())) annotated_partitions = [] for mm_partition in mm_partitions: diff --git a/backends/arm/quantizer/quantization_annotation/mul_annotator.py b/backends/arm/quantizer/quantization_annotation/mul_annotator.py index 6ec8f95531b..4717eac320d 100644 --- a/backends/arm/quantizer/quantization_annotation/mul_annotator.py +++ b/backends/arm/quantizer/quantization_annotation/mul_annotator.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import itertools import operator from typing import Callable, List, Optional diff --git a/backends/arm/quantizer/quantization_annotation/one_to_one_annotator.py b/backends/arm/quantizer/quantization_annotation/one_to_one_annotator.py new file mode 100644 index 00000000000..8d507c11ef3 --- /dev/null +++ b/backends/arm/quantizer/quantization_annotation/one_to_one_annotator.py @@ -0,0 +1,57 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# Copyright 2024 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-unsafe + +from typing import Callable, List, Optional + +import torch +import torch.fx +from executorch.backends.arm.quantizer import arm_quantizer_utils +from executorch.backends.arm.quantizer.quantization_annotation import register_annotator +from executorch.backends.arm.quantizer.quantization_config import QuantizationConfig +from torch.ao.quantization.quantizer.utils import ( + _annotate_input_qspec_map, + _annotate_output_qspec, +) +from torch.fx import Node + + +@register_annotator("one_to_one") +def _annotate_one_to_one( + gm: torch.fx.GraphModule, + quantization_config: QuantizationConfig, + filter_fn: Optional[Callable[[Node], bool]] = None, +) -> Optional[List[List[Node]]]: + """ + This annotator adds the input and output qspec from the quantization config to + ops in 'one_to_one_ops' that have the following properties: + - Have a single input and single output. + - Can handle different qspecs on the input and output. + + Typical ops are ops implemented with a lookup table. 
+ """ + annotated_partitions = [] + one_to_one_ops = (torch.ops.aten.exp.default, torch.ops.aten.log.default) + for node in gm.graph.nodes: + if node.op != "call_function" or node.target not in one_to_one_ops: + continue + if filter_fn and not filter_fn(node): + continue + input_node = node.args[0] + + if not arm_quantizer_utils.is_annotated(node): + _annotate_input_qspec_map( + node, + input_node, + quantization_config.get_input_act_qspec(), + ) + _annotate_output_qspec(node, quantization_config.get_output_act_qspec()) + + arm_quantizer_utils.mark_nodes_as_annotated([node]) + annotated_partitions.append([node]) + + return annotated_partitions diff --git a/backends/arm/quantizer/quantization_annotation/sigmoid_annotator.py b/backends/arm/quantizer/quantization_annotation/sigmoid_annotator.py index bd683d81f0b..3d242694836 100644 --- a/backends/arm/quantizer/quantization_annotation/sigmoid_annotator.py +++ b/backends/arm/quantizer/quantization_annotation/sigmoid_annotator.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + from typing import Callable, List, Optional import torch diff --git a/backends/arm/quantizer/quantization_annotation/sub_annotator.py b/backends/arm/quantizer/quantization_annotation/sub_annotator.py index 4686d480edb..92f1808d023 100644 --- a/backends/arm/quantizer/quantization_annotation/sub_annotator.py +++ b/backends/arm/quantizer/quantization_annotation/sub_annotator.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import itertools import operator from typing import Callable, List, Optional diff --git a/backends/arm/quantizer/quantization_config.py b/backends/arm/quantizer/quantization_config.py index f94c3e18da6..1e776d37a6f 100644 --- a/backends/arm/quantizer/quantization_config.py +++ b/backends/arm/quantizer/quantization_config.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
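The one_to_one annotator above deliberately allows different input and output qspecs because single-input elementwise ops such as exp and log are typically lowered to a 256-entry lookup table: each int8 input code maps directly to an int8 output code, so nothing forces both sides to share parameters. A sketch with made-up scales and zero-points.

import torch

in_scale, in_zp = 0.1, 0
out_scale, out_zp = 0.05, -128

# Build the table once: dequantize every possible input code, apply exp, requantize.
codes = torch.arange(-128, 128, dtype=torch.float32)
real = torch.exp((codes - in_zp) * in_scale)
lut = torch.clamp((real / out_scale).round() + out_zp, -128, 127).to(torch.int8)

x_q = torch.tensor([-10, 0, 15])   # quantized inputs
print(lut[x_q + 128])              # quantized exp outputs, no float math at runtime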
+# pyre-unsafe + from dataclasses import dataclass import torch diff --git a/backends/arm/runtime/ArmBackendEthosU.cpp b/backends/arm/runtime/ArmBackendEthosU.cpp index 7420874d8f4..6d9ab6b0091 100644 --- a/backends/arm/runtime/ArmBackendEthosU.cpp +++ b/backends/arm/runtime/ArmBackendEthosU.cpp @@ -11,9 +11,9 @@ */ #include +#include #include -#include #include "executorch/backends/arm/runtime/VelaBinStream.h" #include "executorch/runtime/backend/interface.h" @@ -31,7 +31,22 @@ typedef struct { bool permuted_io_flag; } ExecutionHandle; -class ArmBackend final : public PyTorchBackendInterface { +extern "C" { +void __attribute__((weak)) ArmBackend_execute_begin() {} +void __attribute__((weak)) ArmBackend_execute_end() {} +} + +class ArmBackendExecuteCallbacks { + public: + ArmBackendExecuteCallbacks() { + ArmBackend_execute_begin(); + } + ~ArmBackendExecuteCallbacks() { + ArmBackend_execute_end(); + } +}; + +class ArmBackend final : public ::executorch::runtime::BackendInterface { public: ArmBackend() {} @@ -82,6 +97,7 @@ class ArmBackend final : public PyTorchBackendInterface { ExecutionHandle* execution_handle = (ExecutionHandle*)input_handle; VelaHandles handles; + ArmBackendExecuteCallbacks ArmBackend_execute_callbacks; // Command stream - we know at this point it's aligned char* data = (char*)execution_handle->processed->data(); ET_LOG(Info, "ArmBackend::execute %p", data); @@ -147,8 +163,9 @@ class ArmBackend final : public PyTorchBackendInterface { if (both_char and permuted_input_shape) { // permuted byte copy CHW to HWC permute_CHW_to_HWC( - scratch_addr, tensor_in.mutable_data_ptr(), + scratch_addr, + tensor_in.size(1), tensor_in.size(2), tensor_in.size(3)); } else if (both_char or both_int) { @@ -164,8 +181,10 @@ class ArmBackend final : public PyTorchBackendInterface { } // Allocate driver handle and synchronously invoke driver - ethosu_driver* drv = ethosu_reserve_driver(); - if (drv == NULL) { + auto driver = + std::unique_ptr( + ethosu_reserve_driver(), ethosu_release_driver); + if (driver == NULL) { ET_LOG(Error, "ArmBackend::execute: ethosu_reserve_driver failed"); return Error::InvalidState; } @@ -178,7 +197,7 @@ class ArmBackend final : public PyTorchBackendInterface { size_t bases_size[2] = { handles.weight_data_size, handles.scratch_data_size}; int result = ethosu_invoke_v3( - drv, + driver.get(), (void*)handles.cmd_data, handles.cmd_data_size, bases, @@ -201,17 +220,34 @@ class ArmBackend final : public PyTorchBackendInterface { // Process input EValue into scratch // Outputs are in the index immediately after inputs auto tensor_out = args[handles.inputs->count + i]->toTensor(); - for (int j = 0; j < tensor_out.numel(); j++) { - if (tensor_out.scalar_type() == ScalarType::Char) { - char* output_address = (char*)output_addr; - tensor_out.mutable_data_ptr()[j] = output_address[j]; - } else { - int* output_address = (int*)output_addr; - tensor_out.mutable_data_ptr()[j] = output_address[j]; + bool permuted_output_shape; + ET_CHECK_OK_OR_RETURN_ERROR(check_requires_permute( + i, + tensor_out, + &handles.outputs->io[i], + execution_handle->permuted_io_flag, + &permuted_output_shape)); + if (tensor_out.scalar_type() == ScalarType::Char and + permuted_output_shape) { + char* output_address = (char*)output_addr; + permute_HWC_to_CHW( + output_address, + tensor_out.mutable_data_ptr(), + tensor_out.size(1), + tensor_out.size(2), + tensor_out.size(3)); + } else { + for (int j = 0; j < tensor_out.numel(); j++) { + if (tensor_out.scalar_type() == ScalarType::Char) { + char* 
output_address = (char*)output_addr; + tensor_out.mutable_data_ptr()[j] = output_address[j]; + } else { + int* output_address = (int*)output_addr; + tensor_out.mutable_data_ptr()[j] = output_address[j]; + } } } } - return Error::Ok; } @@ -222,51 +258,71 @@ class ArmBackend final : public PyTorchBackendInterface { private: Error check_requires_permute( int index, - const exec_aten::Tensor tensor_in, - VelaIO* input, + const exec_aten::Tensor tensor, + VelaIO* io, bool permuted_io_flag, bool* is_permuted) const { - bool permuted_input_shape = false; - if (tensor_in.dim() == 4) { + bool permuted_shape = false; + if (tensor.dim() == 4) { // special case for NHWC workaround in AOT; as the compilation has // permuted to channel last in an undetectable way, we assume here - // that the application has similarly permuted any input tensors. - permuted_input_shape = tensor_in.size(0) == input->shape[0] && - tensor_in.size(1) == input->shape[3] && - tensor_in.size(2) == input->shape[1] && - tensor_in.size(3) == input->shape[2]; - if (permuted_input_shape) { - ET_LOG(Info, "Tensor input %d will be permuted", index); + // that the application has similarly permuted any input/output tensors. + permuted_shape = tensor.size(0) == io->shape[0] && + tensor.size(1) == io->shape[3] && tensor.size(2) == io->shape[1] && + tensor.size(3) == io->shape[2]; + if (permuted_shape) { + ET_LOG(Info, "Tensor input/output %d will be permuted", index); } - if (permuted_io_flag != permuted_input_shape) { - ET_LOG(Error, "Permute compile flag and permuted input don't agree"); + if (permuted_io_flag != permuted_shape) { + ET_LOG( + Error, + "Permute compile flag and permuted input/output don't agree"); return Error::InvalidProgram; } } - if (!permuted_input_shape) { - // Error check matching shapes in the general case - for (int i = 0; i < tensor_in.dim(); i++) { - if (tensor_in.size(i) != input->shape[i]) { - ET_LOG(Error, "Tensor input %d mismatched shape", index); - ET_LOG( - Error, - "dimension %d mismatch, %zd != %d", - index, - tensor_in.size(i), - input->shape[i]); - return Error::InvalidProgram; - } + if (!permuted_shape) { + // Check the number of elements in each tensor match + int tensor_count = 1; + int io_count = 1; + + for (int i = 0; i < tensor.dim(); i++) { + tensor_count = tensor_count * tensor.size(i); + } + + // The VelaIO type has a shape of fixed size 4 + for (int i = 0; i < 4; i++) { + io_count = io_count * io->shape[i]; + } + + if (tensor_count != io_count) { + ET_LOG(Error, "Input tensor sizes do not match"); + ET_LOG( + Error, + "Program expects %d elements but got %d", + io_count, + tensor_count); + return Error::InvalidProgram; } } - *is_permuted = permuted_input_shape; + *is_permuted = permuted_shape; return Error::Ok; } - void permute_CHW_to_HWC(char* input, char* output, int H, int W) const { + void permute_CHW_to_HWC(char* input, char* output, int C, int H, int W) + const { for (int i = 0; i != H * W; ++i) { - output[i * 3 + 0] = input[i + 0 * W * H]; - output[i * 3 + 1] = input[i + 1 * W * H]; - output[i * 3 + 2] = input[i + 2 * W * H]; + for (int j = 0; j < C; ++j) { + output[i * C + j] = input[i + j * W * H]; + } + } + } + + void permute_HWC_to_CHW(char* input, char* output, int C, int H, int W) + const { + for (int i = 0; i != H * W; ++i) { + for (int j = 0; j < C; ++j) { + output[i + j * W * H] = input[i * C + j]; + } } } }; diff --git a/backends/arm/test/common.py b/backends/arm/test/common.py index f85fd1f2dac..0d50f1882da 100644 --- a/backends/arm/test/common.py +++ 
b/backends/arm/test/common.py @@ -14,6 +14,7 @@ import torch from executorch.backends.arm.arm_backend import ArmCompileSpecBuilder +from executorch.exir.backend.compile_spec_schema import CompileSpec _enabled_options: list[str] = [] @@ -85,7 +86,9 @@ def is_option_enabled(option: str, fail_if_not_enabled: bool = False) -> bool: return False -def get_tosa_compile_spec(permute_memory_to_nhwc=True, custom_path=None): +def get_tosa_compile_spec( + permute_memory_to_nhwc=True, custom_path=None +) -> list[CompileSpec]: """ Default compile spec for TOSA tests. """ @@ -112,8 +115,8 @@ def get_tosa_compile_spec_unbuilt( def get_u55_compile_spec( - permute_memory_to_nhwc=False, quantize_io=False, custom_path=None -): + permute_memory_to_nhwc=True, quantize_io=False, custom_path=None +) -> list[CompileSpec]: """ Default compile spec for Ethos-U55 tests. """ @@ -122,10 +125,21 @@ def get_u55_compile_spec( ).build() +def get_u85_compile_spec( + permute_memory_to_nhwc=True, quantize_io=False, custom_path=None +) -> list[CompileSpec]: + """ + Default compile spec for Ethos-U85 tests. + """ + return get_u85_compile_spec_unbuilt( + permute_memory_to_nhwc, quantize_io=quantize_io, custom_path=custom_path + ).build() + + def get_u55_compile_spec_unbuilt( - permute_memory_to_nhwc=False, quantize_io=False, custom_path=None + permute_memory_to_nhwc=True, quantize_io=False, custom_path=None ) -> ArmCompileSpecBuilder: - """Get the ArmCompileSpecBuilder for the default TOSA tests, to modify + """Get the ArmCompileSpecBuilder for the Ethos-U55 tests, to modify the compile spec before calling .build() to finalize it. """ artifact_path = custom_path or tempfile.mkdtemp(prefix="arm_u55_") @@ -137,7 +151,29 @@ def get_u55_compile_spec_unbuilt( "ethos-u55-128", system_config="Ethos_U55_High_End_Embedded", memory_mode="Shared_Sram", - extra_flags=None, + extra_flags="--debug-force-regor --output-format=raw", + ) + .set_quantize_io(is_option_enabled("quantize_io") or quantize_io) + .set_permute_memory_format(permute_memory_to_nhwc) + .dump_intermediate_artifacts_to(artifact_path) + ) + return compile_spec + + +def get_u85_compile_spec_unbuilt( + permute_memory_to_nhwc=True, quantize_io=False, custom_path=None +) -> list[CompileSpec]: + """Get the ArmCompileSpecBuilder for the Ethos-U85 tests, to modify + the compile spec before calling .build() to finalize it. + """ + artifact_path = custom_path or tempfile.mkdtemp(prefix="arm_u85_") + compile_spec = ( + ArmCompileSpecBuilder() + .ethosu_compile_spec( + "ethos-u85-128", + system_config="Ethos_U85_SYS_DRAM_Mid", + memory_mode="Shared_Sram", + extra_flags="--output-format=raw", ) .set_quantize_io(is_option_enabled("quantize_io") or quantize_io) .set_permute_memory_format(permute_memory_to_nhwc) diff --git a/backends/arm/test/misc/test_lifted_tensor.py b/backends/arm/test/misc/test_lifted_tensor.py new file mode 100644 index 00000000000..90aa7e2950c --- /dev/null +++ b/backends/arm/test/misc/test_lifted_tensor.py @@ -0,0 +1,42 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
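The test added below exercises placeholders of type lifted tensor constant: when torch.export encounters a plain tensor attribute (neither a parameter nor a registered buffer), it lifts it into the graph signature rather than leaving it as a get_attr. A rough sketch of where such constants surface; the exact API surface may differ between torch versions.

import torch
from torch.export import export

class M(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.offset = torch.tensor([[1.0, 2.0], [3.0, 4.0]])  # plain attribute

    def forward(self, x):
        return x + self.offset

ep = export(M(), (torch.ones(2, 2),))
print(ep.graph_signature.lifted_tensor_constants)  # the lifted attribute shows up here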
+ +import unittest + +import torch +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.arm_tester import ArmTester + + +class LiftedTensor(torch.nn.Module): + + def __init__(self): + super().__init__() + self.lifted_tensor = torch.Tensor([[1, 2], [3, 4]]) + + def forward(self, x: torch.Tensor, length) -> torch.Tensor: + sliced = self.lifted_tensor[:, :length] + return sliced + x + + +class TestLiftedTensor(unittest.TestCase): + """Tests the ArmPartitioner with a placeholder of type lifted tensor.""" + + def test_partition_lifted_tensor(self): + tester = ( + ArmTester( + LiftedTensor(), + example_inputs=(torch.ones(2, 2), 2), + compile_spec=common.get_tosa_compile_spec(), + ) + .export() + .to_edge() + .dump_artifact() + ) + signature = tester.get_artifact().exported_program().graph_signature + assert len(signature.lifted_tensor_constants) > 0 + tester.partition() + tester.to_executorch() + tester.run_method_and_compare_outputs((torch.ones(2, 2), 2)) diff --git a/backends/arm/test/models/test_mobilenet_v2_arm.py b/backends/arm/test/models/test_mobilenet_v2_arm.py index 248153a5180..f9d408c1bae 100644 --- a/backends/arm/test/models/test_mobilenet_v2_arm.py +++ b/backends/arm/test/models/test_mobilenet_v2_arm.py @@ -84,7 +84,7 @@ def test_mv2_tosa_BI(self): ) def test_mv2_u55_BI(self): - ( + tester = ( ArmTester( self.mv2, example_inputs=self.model_inputs, @@ -96,4 +96,24 @@ def test_mv2_u55_BI(self): .check(list(self.operators_after_quantization)) .partition() .to_executorch() + .serialize() + ) + if common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs( + atol=1.0, qtol=1, inputs=self.model_inputs + ) + + def test_mv2_u85_BI(self): + ( + ArmTester( + self.mv2, + example_inputs=self.model_inputs, + compile_spec=common.get_u85_compile_spec(permute_memory_to_nhwc=True), + ) + .quantize() + .export() + .to_edge(config=self._edge_compile_config) + .check(list(self.operators_after_quantization)) + .partition() + .to_executorch() ) diff --git a/backends/arm/test/ops/test_add.py b/backends/arm/test/ops/test_add.py index 3bd2b2605c4..cff8af11654 100644 --- a/backends/arm/test/ops/test_add.py +++ b/backends/arm/test/ops/test_add.py @@ -13,6 +13,7 @@ from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.arm_tester import ArmTester from executorch.exir import EdgeCompileConfig +from executorch.exir.backend.compile_spec_schema import CompileSpec from parameterized import parameterized @@ -37,9 +38,9 @@ class Add2(torch.nn.Module): torch.FloatTensor([1, 2, 3, 5, 7]), (torch.FloatTensor([2, 1, 2, 1, 10])), ), - (torch.ones(1, 1, 4, 4), torch.ones(1, 1, 4, 4)), + (torch.ones(1, 10, 4, 6), torch.ones(1, 10, 4, 6)), (torch.randn(1, 1, 4, 4), torch.ones(1, 1, 4, 1)), - (torch.randn(1, 1, 4, 4), torch.randn(1, 1, 4, 1)), + (torch.randn(1, 3, 4, 4), torch.randn(1, 3, 4, 4)), (10000 * torch.randn(1, 1, 4, 4), torch.randn(1, 1, 4, 1)), ] @@ -92,16 +93,17 @@ def _test_add_tosa_BI_pipeline( .run_method_and_compare_outputs(inputs=test_data, qtol=1) ) - def _test_add_u55_BI_pipeline( + def _test_add_ethos_BI_pipeline( self, module: torch.nn.Module, + compile_spec: CompileSpec, test_data: Tuple[torch.Tensor], ): tester = ( ArmTester( module, example_inputs=test_data, - compile_spec=common.get_u55_compile_spec(), + compile_spec=compile_spec, ) .quantize() .export() @@ -114,8 +116,7 @@ def _test_add_u55_BI_pipeline( .serialize() ) - if common.is_option_enabled("corstone300"): - 
tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) + return tester @parameterized.expand(Add.test_parameters) def test_add_tosa_MI(self, test_data: torch.Tensor): @@ -130,7 +131,22 @@ def test_add_tosa_BI(self, test_data: torch.Tensor): @parameterized.expand(Add.test_parameters) def test_add_u55_BI(self, test_data: torch.Tensor): test_data = (test_data,) - self._test_add_u55_BI_pipeline(self.Add(), test_data) + tester = self._test_add_ethos_BI_pipeline( + self.Add(), + common.get_u55_compile_spec(permute_memory_to_nhwc=True), + test_data, + ) + if common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) + + @parameterized.expand(Add.test_parameters) + def test_add_u85_BI(self, test_data: torch.Tensor): + test_data = (test_data,) + self._test_add_ethos_BI_pipeline( + self.Add(), + common.get_u85_compile_spec(permute_memory_to_nhwc=True), + test_data, + ) @parameterized.expand(Add2.test_parameters) def test_add2_tosa_MI(self, operand1: torch.Tensor, operand2: torch.Tensor): @@ -145,4 +161,15 @@ def test_add2_tosa_BI(self, operand1: torch.Tensor, operand2: torch.Tensor): @parameterized.expand(Add2.test_parameters) def test_add2_u55_BI(self, operand1: torch.Tensor, operand2: torch.Tensor): test_data = (operand1, operand2) - self._test_add_u55_BI_pipeline(self.Add2(), test_data) + tester = self._test_add_ethos_BI_pipeline( + self.Add2(), common.get_u55_compile_spec(), test_data + ) + if common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) + + @parameterized.expand(Add2.test_parameters) + def test_add2_u85_BI(self, operand1: torch.Tensor, operand2: torch.Tensor): + test_data = (operand1, operand2) + self._test_add_ethos_BI_pipeline( + self.Add2(), common.get_u85_compile_spec(), test_data + ) diff --git a/backends/arm/test/ops/test_avg_pool.py b/backends/arm/test/ops/test_avg_pool.py index 32a0e5555a3..6c14420dbcf 100644 --- a/backends/arm/test/ops/test_avg_pool.py +++ b/backends/arm/test/ops/test_avg_pool.py @@ -13,6 +13,7 @@ import torch from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.exir.backend.backend_details import CompileSpec from parameterized import parameterized logger = logging.getLogger(__name__) @@ -86,14 +87,17 @@ def _test_avgpool2d_tosa_BI_pipeline( .run_method_and_compare_outputs(inputs=test_data, qtol=1) ) - def _test_avgpool2d_tosa_u55_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.tensor] + def _test_avgpool2d_tosa_ethos_BI_pipeline( + self, + module: torch.nn.Module, + compile_spec: CompileSpec, + test_data: Tuple[torch.tensor], ): ( ArmTester( module, example_inputs=test_data, - compile_spec=common.get_u55_compile_spec(permute_memory_to_nhwc=True), + compile_spec=compile_spec, ) .quantize() .export() @@ -141,6 +145,22 @@ def test_avgpool2d_tosa_u55_BI( test_data: torch.Tensor, model_params: int | Tuple[int, int], ): - self._test_avgpool2d_tosa_u55_BI_pipeline( - self.AvgPool2d(*model_params), (test_data,) + self._test_avgpool2d_tosa_ethos_BI_pipeline( + self.AvgPool2d(*model_params), + common.get_u55_compile_spec(permute_memory_to_nhwc=True), + (test_data,), + ) + + @parameterized.expand(test_data_suite) + @unittest.expectedFailure + def test_avgpool2d_tosa_u85_BI( + self, + test_name: str, + test_data: torch.Tensor, + model_params: int | Tuple[int, int], + ): + self._test_avgpool2d_tosa_ethos_BI_pipeline( + self.AvgPool2d(*model_params), + 
common.get_u85_compile_spec(permute_memory_to_nhwc=True), + (test_data,), ) diff --git a/backends/arm/test/ops/test_bmm.py b/backends/arm/test/ops/test_bmm.py new file mode 100644 index 00000000000..e4e6abb7bb3 --- /dev/null +++ b/backends/arm/test/ops/test_bmm.py @@ -0,0 +1,148 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import unittest + +from typing import Tuple + +import torch +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.exir.backend.compile_spec_schema import CompileSpec +from parameterized import parameterized + +torch.manual_seed(1) + + +class TestBMM(unittest.TestCase): + """Tests Batch MatMul""" + + class BMM(torch.nn.Module): + test_parameters = [ + (torch.rand(5, 3, 5), torch.rand(5, 5, 2)), + (torch.rand(2, 1, 1), torch.rand(2, 1, 1)), + (torch.ones(1, 55, 3), torch.ones(1, 3, 44)), + (10000 * torch.randn(10, 1, 10), torch.randn(10, 10, 5)), + (-10 * torch.randn(2, 32, 64), 5 + 5 * torch.randn(2, 64, 32)), + ] + + def forward(self, x, y): + return torch.bmm(x, y) + + class BMMSingleInput(torch.nn.Module): + test_parameters = [ + (torch.rand(20, 3, 3),), + (torch.ones(2, 128, 128),), + (10000 * torch.randn(4, 25, 25),), + (5 + 5 * torch.randn(3, 64, 64),), + ] + + def forward(self, x): + return torch.bmm(x, x) + + def _test_bmm_tosa_MI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.Tensor, ...] + ): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(), + ) + .export() + .check_count({"torch.ops.aten.bmm.default": 1}) + .check_not(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_not(["executorch_exir_dialects_edge__ops_aten_bmm_default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data) + ) + + def _test_bmm_tosa_BI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.Tensor, ...] 
+ ): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(), + ) + .quantize() + .export() + .check_count({"torch.ops.aten.bmm.default": 1}) + .check(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_not(["executorch_exir_dialects_edge__ops_aten_bmm_default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data) + ) + + def _test_bmm_ethosu_BI_pipeline( + self, + module: torch.nn.Module, + compile_spec: CompileSpec, + test_data: Tuple[torch.Tensor, ...], + ): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=compile_spec, + ) + .quantize() + .export() + .check_count({"torch.ops.aten.bmm.default": 1}) + .check(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + ) + + @parameterized.expand(BMM.test_parameters) + def test_bmm_tosa_MI(self, operand1: torch.Tensor, operand2: torch.Tensor): + test_data = (operand1, operand2) + self._test_bmm_tosa_MI_pipeline(self.BMM(), test_data) + + @parameterized.expand(BMMSingleInput.test_parameters) + def test_bmm_single_input_tosa_MI(self, operand1: torch.Tensor): + test_data = (operand1,) + self._test_bmm_tosa_MI_pipeline(self.BMMSingleInput(), test_data) + + @parameterized.expand(BMM.test_parameters) + def test_bmm_tosa_BI(self, operand1: torch.Tensor, operand2: torch.Tensor): + test_data = (operand1, operand2) + self._test_bmm_tosa_BI_pipeline(self.BMM(), test_data) + + @parameterized.expand(BMMSingleInput.test_parameters) + def test_bmm_single_input_tosa_BI(self, operand1: torch.Tensor): + test_data = (operand1,) + self._test_bmm_tosa_BI_pipeline(self.BMMSingleInput(), test_data) + + @parameterized.expand(BMM.test_parameters) + def test_bmm_u55_BI(self, operand1: torch.Tensor, operand2: torch.Tensor): + test_data = (operand1, operand2) + self._test_bmm_tosa_BI_pipeline(self.BMM(), test_data) + + # Expected to fail with error: Warning, unsupported fusing of TOSA Rescale previous operator is of type: Memcpy + @parameterized.expand(BMMSingleInput.test_parameters) + @unittest.expectedFailure + def test_bmm_single_input_u55_BI(self, operand1: torch.Tensor): + test_data = (operand1,) + self._test_bmm_ethosu_BI_pipeline( + self.BMMSingleInput(), common.get_u55_compile_spec(), test_data + ) + + @parameterized.expand(BMMSingleInput.test_parameters) + def test_bmm_single_input_u85_BI(self, operand1: torch.Tensor): + test_data = (operand1,) + self._test_bmm_ethosu_BI_pipeline( + self.BMMSingleInput(), common.get_u85_compile_spec(), test_data + ) diff --git a/backends/arm/test/ops/test_cat.py b/backends/arm/test/ops/test_cat.py new file mode 100644 index 00000000000..9723ba0f0c0 --- /dev/null +++ b/backends/arm/test/ops/test_cat.py @@ -0,0 +1,144 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# Copyright 2024 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
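For reference on the new BMM tests above: torch.bmm multiplies batched matrices of shapes (B, N, M) and (B, M, P) into (B, N, P), and the single-input variant bmm(x, x) therefore only type-checks when each batch matrix is square. A quick shape check matching the first test parameters.

import torch

x, y = torch.rand(5, 3, 5), torch.rand(5, 5, 2)
print(torch.bmm(x, y).shape)            # torch.Size([5, 3, 2])

square = torch.rand(20, 3, 3)
print(torch.bmm(square, square).shape)  # torch.Size([20, 3, 3])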
+ +import unittest + +from typing import Tuple + +import torch +from executorch.backends.arm.test import common + +from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.exir.backend.compile_spec_schema import CompileSpec +from parameterized import parameterized + + +class TestCat(unittest.TestCase): + + class Cat(torch.nn.Module): + test_parameters = [ + ((torch.ones(1), torch.ones(1)), 0), + ((torch.ones(1, 2), torch.randn(1, 5), torch.randn(1, 1)), 1), + ( + ( + torch.ones(1, 2, 5), + torch.randn(1, 2, 4), + torch.randn(1, 2, 2), + torch.randn(1, 2, 1), + ), + -1, + ), + ((torch.randn(2, 2, 4, 4), torch.randn(2, 2, 4, 1)), 3), + ( + ( + 10000 * torch.randn(2, 3, 1, 4), + torch.randn(2, 7, 1, 4), + torch.randn(2, 1, 1, 4), + ), + -3, + ), + ] + + def __init__(self): + super().__init__() + + def forward(self, tensors: tuple[torch.Tensor, ...], dim: int) -> torch.Tensor: + return torch.cat(tensors, dim=dim) + + def _test_cat_tosa_MI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[tuple[torch.Tensor, ...], int] + ): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(), + ) + .export() + .check_count({"torch.ops.aten.cat.default": 1}) + .check_not(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_not(["executorch_exir_dialects_edge__ops_aten_cat_default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data) + ) + + def _test_cat_tosa_BI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[tuple[torch.Tensor, ...], int] + ): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(), + ) + .quantize() + .export() + .check_count({"torch.ops.aten.cat.default": 1}) + .check(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_not(["executorch_exir_dialects_edge__ops_aten_cat_default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data, qtol=1) + ) + + def _test_cat_ethosu_BI_pipeline( + self, + module: torch.nn.Module, + compile_spec: CompileSpec, + test_data: Tuple[tuple[torch.Tensor, ...], int], + ): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=compile_spec, + ) + .quantize() + .export() + .check_count({"torch.ops.aten.cat.default": 1}) + .check(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_not(["executorch_exir_dialects_edge__ops_aten_cat_default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + ) + + @parameterized.expand(Cat.test_parameters) + def test_cat_tosa_MI(self, operands: tuple[torch.Tensor, ...], dim: int): + test_data = (operands, dim) + self._test_cat_tosa_MI_pipeline(self.Cat(), test_data) + + def test_cat_4d_tosa_MI(self): + square = torch.ones((2, 2, 2, 2)) + for dim in range(-3, 3): + test_data = ((square, square), dim) + self._test_cat_tosa_MI_pipeline(self.Cat(), test_data) + + @parameterized.expand(Cat.test_parameters) + def test_cat_tosa_BI(self, operands: tuple[torch.Tensor, ...], dim: int): + test_data = (operands, dim) + self._test_cat_tosa_BI_pipeline(self.Cat(), test_data) + + @parameterized.expand(Cat.test_parameters) + def test_cat_u55_BI(self, operands: tuple[torch.Tensor, ...], dim: int): + test_data = (operands, dim) + self._test_cat_ethosu_BI_pipeline( + self.Cat(), common.get_u55_compile_spec(), test_data + 
) + + @parameterized.expand(Cat.test_parameters) + def test_cat_u85_BI(self, operands: tuple[torch.Tensor, ...], dim: int): + test_data = (operands, dim) + self._test_cat_ethosu_BI_pipeline( + self.Cat(), common.get_u85_compile_spec(), test_data + ) diff --git a/backends/arm/test/ops/test_clone.py b/backends/arm/test/ops/test_clone.py index 8386283f24e..9852c5c4520 100644 --- a/backends/arm/test/ops/test_clone.py +++ b/backends/arm/test/ops/test_clone.py @@ -21,6 +21,8 @@ from executorch.backends.arm.test.tester.arm_tester import ArmTester from executorch.backends.xnnpack.test.tester.tester import Quantize + +from executorch.exir.backend.compile_spec_schema import CompileSpec from parameterized import parameterized @@ -76,16 +78,15 @@ def _test_clone_tosa_BI_pipeline( .run_method_and_compare_outputs(inputs=test_data, qtol=1) ) - def _test_clone_tosa_u55_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] + def _test_clone_tosa_ethos_pipeline( + self, + compile_spec: list[CompileSpec], + module: torch.nn.Module, + test_data: Tuple[torch.Tensor], ): quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_u55_compile_spec(), - ) + ArmTester(module, example_inputs=test_data, compile_spec=compile_spec) .quantize(Quantize(quantizer, get_symmetric_quantization_config())) .export() .check_count({"torch.ops.aten.clone.default": 1}) @@ -95,6 +96,20 @@ def _test_clone_tosa_u55_pipeline( .to_executorch() ) + def _test_clone_tosa_u55_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] + ): + self._test_clone_tosa_ethos_pipeline( + common.get_u55_compile_spec(), module, test_data + ) + + def _test_clone_tosa_u85_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] + ): + self._test_clone_tosa_ethos_pipeline( + common.get_u85_compile_spec(), module, test_data + ) + @parameterized.expand(Clone.test_parameters) def test_clone_tosa_MI(self, test_tensor: torch.Tensor): self._test_clone_tosa_MI_pipeline(self.Clone(), (test_tensor,)) @@ -106,3 +121,7 @@ def test_clone_tosa_BI(self, test_tensor: torch.Tensor): @parameterized.expand(Clone.test_parameters) def test_clone_u55_BI(self, test_tensor: torch.Tensor): self._test_clone_tosa_u55_pipeline(self.Clone(), (test_tensor,)) + + @parameterized.expand(Clone.test_parameters) + def test_clone_u85_BI(self, test_tensor: torch.Tensor): + self._test_clone_tosa_u85_pipeline(self.Clone(), (test_tensor,)) diff --git a/backends/arm/test/ops/test_conv.py b/backends/arm/test/ops/test_conv.py index 9ebfe77da2c..286404922f2 100644 --- a/backends/arm/test/ops/test_conv.py +++ b/backends/arm/test/ops/test_conv.py @@ -4,7 +4,6 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
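The clone tests above rest on the fact that aten.clone is numerically the identity; it only forces a data copy, which is why RemoveClonePass can drop it before delegation without changing outputs. Trivially:

import torch

x = torch.rand(2, 3, 4)
print(torch.equal(x.clone(), x))  # True: removing the clone cannot change results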
-import logging import unittest from typing import List, Tuple, Union @@ -13,11 +12,9 @@ from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.exir.backend.compile_spec_schema import CompileSpec from parameterized import parameterized -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - class Conv2d(torch.nn.Module): """ @@ -159,14 +156,14 @@ def forward(self, x): batches=1, ) -conv2d_2x2_1x1x14x14_st2 = Conv2d( +conv2d_2x2_1x1x14x13_st2 = Conv2d( in_channels=1, out_channels=1, kernel_size=(2, 2), stride=2, padding=0, width=14, - height=14, + height=13, batches=1, ) @@ -192,6 +189,18 @@ def forward(self, x): batches=1, ) +conv2d_5x5_1x3x14x15_st3_pd1 = Conv2d( + in_channels=3, + out_channels=16, + kernel_size=(5, 5), + stride=3, + padding=1, + width=14, + height=15, + batches=1, +) + + two_conv2d_nobias = Conv2d( nbr_conv=2, width=256, @@ -225,7 +234,8 @@ def forward(self, x): ("3x3_1x3x256x256_st1", conv2d_3x3_1x3x256x256_st1), ("3x3_1x3x12x12_st2_pd1", conv2d_3x3_1x3x12x12_st2_pd1), ("1x1_1x2x128x128_st1", conv2d_1x1_1x2x128x128_st1), - ("2x2_1x1x14x14_st2", conv2d_2x2_1x1x14x14_st2), + ("2x2_1x1x14x13_st2_needs_adjust_pass", conv2d_2x2_1x1x14x13_st2), + ("conv2d_5x5_1x3x14x15_st3_pd1_needs_adjust_pass", conv2d_5x5_1x3x14x15_st3_pd1), ("5x5_3x2x128x128_st1", conv2d_5x5_3x2x128x128_st1), ("3x3_1x3x224x224_st2_pd1", conv2d_3x3_1x3x224x224_st2_pd1), ("two_conv2d_nobias", two_conv2d_nobias), @@ -240,7 +250,10 @@ def forward(self, x): testsuite_u55.remove(("5x5_3x2x128x128_st1", conv2d_5x5_3x2x128x128_st1)) # Fails when enabling CompileSpec.set_quantize_io(True). MLETORCH-191. -testsuite_u55.remove(("2x2_1x1x14x14_st2", conv2d_2x2_1x1x14x14_st2)) +testsuite_u55.remove(("2x2_1x1x14x13_st2_needs_adjust_pass", conv2d_2x2_1x1x14x13_st2)) +testsuite_u55.remove( + ("conv2d_5x5_1x3x14x15_st3_pd1_needs_adjust_pass", conv2d_5x5_1x3x14x15_st3_pd1) +) class TestConv2D(unittest.TestCase): @@ -285,14 +298,17 @@ def _test_conv2d_tosa_BI_pipeline( .run_method_and_compare_outputs(inputs=test_data, qtol=1) ) - def _test_conv2d_u55_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] + def _test_conv2d_ethosu_BI_pipeline( + self, + compile_spec: CompileSpec, + module: torch.nn.Module, + test_data: Tuple[torch.Tensor], ): ( ArmTester( module, example_inputs=test_data, - compile_spec=common.get_u55_compile_spec(permute_memory_to_nhwc=True), + compile_spec=compile_spec, ) .quantize() .export() @@ -313,4 +329,16 @@ def test_conv2d_tosa_BI(self, test_name, model): @parameterized.expand(testsuite_u55) def test_conv2d_u55_BI(self, test_name, model): - self._test_conv2d_u55_BI_pipeline(model, model.get_inputs()) + self._test_conv2d_ethosu_BI_pipeline( + common.get_u55_compile_spec(permute_memory_to_nhwc=True), + model, + model.get_inputs(), + ) + + @parameterized.expand(testsuite_u55) + def test_conv2d_u85_BI(self, test_name, model): + self._test_conv2d_ethosu_BI_pipeline( + common.get_u85_compile_spec(permute_memory_to_nhwc=True), + model, + model.get_inputs(), + ) diff --git a/backends/arm/test/ops/test_conv_combos.py b/backends/arm/test/ops/test_conv_combos.py index 88006df1a01..1fe4f1b5a57 100644 --- a/backends/arm/test/ops/test_conv_combos.py +++ b/backends/arm/test/ops/test_conv_combos.py @@ -12,6 +12,7 @@ import torch from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.exir.backend.backend_details import 
CompileSpec from parameterized import parameterized logger = logging.getLogger(__name__) @@ -102,7 +103,7 @@ def forward(self, x): return self.adaptive_avg_pool2d(x) -class ComboConvBatchnormRelu(torch.nn.Module): +class ComboConvBatchnormRelu6(torch.nn.Module): edge_op_list = [ "executorch_exir_dialects_edge__ops_aten_convolution_default", "executorch_exir_dialects_edge__ops_aten__native_batch_norm_legit_no_training_default", @@ -199,14 +200,17 @@ def _test_conv_combo_tosa_BI_pipeline( ) ) - def _test_conv_combo_u55_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] + def _test_conv_combo_ethos_BI_pipeline( + self, + module: torch.nn.Module, + compile_spec: CompileSpec, + test_data: Tuple[torch.Tensor], ): ( ArmTester( module, example_inputs=test_data, - compile_spec=common.get_u55_compile_spec(permute_memory_to_nhwc=True), + compile_spec=compile_spec, ) .quantize() .export() @@ -230,22 +234,44 @@ def test_conv_meandim_tosa_BI(self): def test_conv_meandim_u55_BI(self): model = ComboConv2dMeandim() - self._test_conv_combo_u55_BI_pipeline(model, model.get_inputs()) + self._test_conv_combo_ethos_BI_pipeline( + model, + common.get_u55_compile_spec(permute_memory_to_nhwc=True), + model.get_inputs(), + ) + + def test_conv_meandim_u85_BI(self): + model = ComboConv2dMeandim() + self._test_conv_combo_ethos_BI_pipeline( + model, + common.get_u85_compile_spec(permute_memory_to_nhwc=True), + model.get_inputs(), + ) ############################## ## Conv + batch norm + relu ## ############################## - def test_conv_batchnorm_relu_tosa_MI(self): - model = ComboConvBatchnormRelu() + def test_conv_batchnorm_relu6_tosa_MI(self): + model = ComboConvBatchnormRelu6() self._test_conv_combo_tosa_MI_pipeline(model, model.get_inputs()) - def test_conv_batchnorm_relu_tosa_BI(self): - model = ComboConvBatchnormRelu() + def test_conv_batchnorm_relu6_tosa_BI(self): + model = ComboConvBatchnormRelu6() self._test_conv_combo_tosa_BI_pipeline(model, model.get_inputs()) - def test_conv_batchnorm_relu_u55_BI(self): - model = ComboConvBatchnormRelu() - self._test_conv_combo_u55_BI_pipeline(model, model.get_inputs()) + def test_conv_batchnorm_relu6_u55_BI(self): + model = ComboConvBatchnormRelu6() + self._test_conv_combo_ethos_BI_pipeline( + model, common.get_u55_compile_spec(), model.get_inputs() + ) + + def test_conv_batchnorm_relu_u85_BI(self): + model = ComboConvBatchnormRelu6() + self._test_conv_combo_ethos_BI_pipeline( + model, + common.get_u85_compile_spec(), + model.get_inputs(), + ) ################## ## Conv + ReLU6 ## @@ -266,7 +292,17 @@ def test_conv_relu6_tosa_BI(self, test_data: torch.Tensor): def test_conv_relu6_u55_BI(self, test_data: torch.Tensor): model = ComboConvRelu6() test_data = (test_data,) - self._test_conv_combo_u55_BI_pipeline(model, test_data) + self._test_conv_combo_ethos_BI_pipeline( + model, common.get_u55_compile_spec(permute_memory_to_nhwc=True), test_data + ) + + @parameterized.expand(ComboConvRelu6.test_data) + def test_conv_relu6_u85_BI(self, test_data: torch.Tensor): + model = ComboConvRelu6() + test_data = (test_data,) + self._test_conv_combo_ethos_BI_pipeline( + model, common.get_u85_compile_spec(permute_memory_to_nhwc=True), test_data + ) ############################### ## Block bottleneck residual ## @@ -281,4 +317,16 @@ def test_block_bottleneck_residual_tosa_BI(self): def test_block_bottleneck_residual_u55_BI(self): model = ComboBlockBottleneckResidual() - self._test_conv_combo_u55_BI_pipeline(model, model.get_inputs()) + 
self._test_conv_combo_ethos_BI_pipeline( + model, + common.get_u55_compile_spec(permute_memory_to_nhwc=True), + model.get_inputs(), + ) + + def test_block_bottleneck_residual_u85_BI(self): + model = ComboBlockBottleneckResidual() + self._test_conv_combo_ethos_BI_pipeline( + model, + common.get_u85_compile_spec(permute_memory_to_nhwc=True), + model.get_inputs(), + ) diff --git a/backends/arm/test/ops/test_depthwise_conv.py b/backends/arm/test/ops/test_depthwise_conv.py index 9b3f79e6a11..11b9e4876bb 100644 --- a/backends/arm/test/ops/test_depthwise_conv.py +++ b/backends/arm/test/ops/test_depthwise_conv.py @@ -16,6 +16,7 @@ from executorch.backends.arm.test.ops.test_conv import Conv2d from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.exir.backend.backend_details import CompileSpec from parameterized import parameterized logger = logging.getLogger(__name__) @@ -172,14 +173,17 @@ def _test_dw_conv2d_tosa_BI_pipeline( .run_method_and_compare_outputs(inputs=test_data, qtol=1) ) - def _test_dw_conv2d_u55_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] + def _test_dw_conv2d_ethos_BI_pipeline( + self, + module: torch.nn.Module, + compile_spec: CompileSpec, + test_data: Tuple[torch.Tensor], ): ( ArmTester( module, example_inputs=test_data, - compile_spec=common.get_u55_compile_spec(permute_memory_to_nhwc=True), + compile_spec=compile_spec, ) .quantize() .export() @@ -191,16 +195,35 @@ def _test_dw_conv2d_u55_BI_pipeline( ) @parameterized.expand(testsuite) - def test_dw_conv2d_tosa_MI(self, test_name, model): + def test_dw_conv2d_tosa_MI(self, test_name: str, model: torch.nn.Module): self._test_dw_conv2d_tosa_MI_pipeline(model, model.get_inputs()) # TODO: Investigate flakyness (MLTORCH-307) @parameterized.expand(testsuite) @pytest.mark.flaky(reruns=3) - def test_dw_conv2d_tosa_BI(self, test_name, model): + def test_dw_conv2d_tosa_BI(self, test_name: str, model: torch.nn.Module): self._test_dw_conv2d_tosa_BI_pipeline(model, model.get_inputs()) @parameterized.expand(testsuite_u55, skip_on_empty=True) - @unittest.expectedFailure - def test_dw_conv2d_u55_BI(self, test_name, model): - self._test_dw_conv2d_u55_BI_pipeline(model, model.get_inputs()) + def test_dw_conv2d_u55_BI( + self, test_name: str, model: torch.nn.Module, set_quantize_io: bool = False + ): + self._test_dw_conv2d_ethos_BI_pipeline( + model, + common.get_u55_compile_spec( + permute_memory_to_nhwc=True, quantize_io=set_quantize_io + ), + model.get_inputs(), + ) + + @parameterized.expand(testsuite) + def test_dw_conv2d_u85_BI( + self, test_name: str, model: torch.nn.Module, set_quantize_io: bool = False + ): + self._test_dw_conv2d_ethos_BI_pipeline( + model, + common.get_u85_compile_spec( + permute_memory_to_nhwc=True, quantize_io=set_quantize_io + ), + model.get_inputs(), + ) diff --git a/backends/arm/test/ops/test_exp.py b/backends/arm/test/ops/test_exp.py new file mode 100644 index 00000000000..6e85d8fe49b --- /dev/null +++ b/backends/arm/test/ops/test_exp.py @@ -0,0 +1,120 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# Copyright 2024 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
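Tying back to SizeAdjustConv2DPass introduced earlier in this diff: conv_remainder measures how many trailing rows or columns of the input can never be covered by a full kernel window, and the pass slices them off whenever that remainder exceeds the padding. A worked example for the 2x2-kernel, stride-2, 14x13 test case flagged as "needs_adjust_pass" above.

def conv_remainder(input_length, pad, dilation, kernel, stride):
    return (input_length + 2 * pad - dilation * (kernel - 1) - 1) % stride

print(conv_remainder(13, 0, 1, 2, 2))  # 1 > pad 0, so the pass slices height 13 down to 12
print(conv_remainder(14, 0, 1, 2, 2))  # 0, a 14-pixel dimension needs no adjustment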
+
+import unittest
+
+from typing import Tuple
+
+import torch
+from executorch.backends.arm.test import common
+from executorch.backends.arm.test.tester.arm_tester import ArmTester
+from executorch.exir.backend.backend_details import CompileSpec
+from parameterized import parameterized
+
+test_data_suite = [
+    # (test_name, test_data)
+    ("zeros", torch.zeros(1, 10, 10, 10)),
+    ("ones", torch.ones(10, 10, 10)),
+    ("rand", torch.rand(10, 10) - 0.5),
+    ("randn_pos", torch.randn(10) + 10),
+    ("randn_neg", torch.randn(10) - 10),
+    ("ramp", torch.arange(-16, 16, 0.2)),
+]
+
+
+class TestExp(unittest.TestCase):
+    """Tests lowering of aten.exp"""
+
+    class Exp(torch.nn.Module):
+        def forward(self, x: torch.Tensor) -> torch.Tensor:
+            return torch.exp(x)
+
+    def _test_exp_tosa_MI_pipeline(
+        self, module: torch.nn.Module, test_data: Tuple[torch.tensor]
+    ):
+        (
+            ArmTester(
+                module,
+                example_inputs=test_data,
+                compile_spec=common.get_tosa_compile_spec(),
+            )
+            .export()
+            .check(["torch.ops.aten.exp.default"])
+            .check_not(["torch.ops.quantized_decomposed"])
+            .to_edge()
+            .partition()
+            .check_not(["executorch_exir_dialects_edge__ops_aten_exp_default"])
+            .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
+            .to_executorch()
+            .run_method_and_compare_outputs(inputs=test_data)
+        )
+
+    def _test_exp_tosa_BI_pipeline(self, module: torch.nn.Module, test_data: Tuple):
+        (
+            ArmTester(
+                module,
+                example_inputs=test_data,
+                compile_spec=common.get_tosa_compile_spec(),
+            )
+            .quantize()
+            .export()
+            .check(["torch.ops.aten.exp.default"])
+            .check(["torch.ops.quantized_decomposed"])
+            .to_edge()
+            .partition()
+            .check_not(["executorch_exir_dialects_edge__ops_aten_exp_default"])
+            .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
+            .to_executorch()
+            .run_method_and_compare_outputs(inputs=test_data)
+        )
+
+    def _test_exp_ethosu_BI_pipeline(
+        self,
+        compile_spec: CompileSpec,
+        module: torch.nn.Module,
+        test_data: Tuple[torch.tensor],
+    ):
+        (
+            ArmTester(
+                module,
+                example_inputs=test_data,
+                compile_spec=compile_spec,
+            )
+            .quantize()
+            .export()
+            .check_count({"torch.ops.aten.exp.default": 1})
+            .check(["torch.ops.quantized_decomposed"])
+            .to_edge()
+            .partition()
+            .check_not(["executorch_exir_dialects_edge__ops_aten_exp_default"])
+            .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
+            .to_executorch()
+        )
+
+    @parameterized.expand(test_data_suite)
+    def test_exp_tosa_MI(
+        self,
+        test_name: str,
+        test_data: torch.Tensor,
+    ):
+        self._test_exp_tosa_MI_pipeline(self.Exp(), (test_data,))
+
+    @parameterized.expand(test_data_suite)
+    def test_exp_tosa_BI(self, test_name: str, test_data: torch.Tensor):
+        self._test_exp_tosa_BI_pipeline(self.Exp(), (test_data,))
+
+    @parameterized.expand(test_data_suite)
+    def test_exp_tosa_u55_BI(self, test_name: str, test_data: torch.Tensor):
+        self._test_exp_ethosu_BI_pipeline(
+            common.get_u55_compile_spec(), self.Exp(), (test_data,)
+        )
+
+    @parameterized.expand(test_data_suite)
+    def test_exp_tosa_u85_BI(self, test_name: str, test_data: torch.Tensor):
+        self._test_exp_ethosu_BI_pipeline(
+            common.get_u85_compile_spec(), self.Exp(), (test_data,)
+        )
diff --git a/backends/arm/test/ops/test_expand.py b/backends/arm/test/ops/test_expand.py
index 66c081a544c..e9bbea9a5e5 100644
--- a/backends/arm/test/ops/test_expand.py
+++ b/backends/arm/test/ops/test_expand.py
@@ -76,7 +76,9 @@ def _test_expand_tosa_BI_pipeline(self, module: torch.nn.Module, test_data: Tupl
.run_method_and_compare_outputs(inputs=test_data, qtol=1) ) - def _test_expand_tosa_u55_pipeline(self, module: torch.nn.Module, test_data: Tuple): + def _test_expand_ethosu_BI_pipeline( + self, module: torch.nn.Module, test_data: Tuple + ): quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) ( ArmTester( @@ -104,6 +106,15 @@ def test_expand_tosa_BI(self, test_input, multiples): # Expected failure since tosa.TILE is unsupported by Vela. @parameterized.expand(Expand.test_parameters) - @unittest.expectedFailure + @unittest.expectedFailure # TODO: MLBEDSW-9386 def test_expand_u55_BI(self, test_input, multiples): - self._test_expand_tosa_u55_pipeline(self.Expand(), (test_input, multiples)) + self._test_expand_ethosu_BI_pipeline( + self.Expand(), common.get_u55_compile_spec(), (test_input, multiples) + ) + + @parameterized.expand(Expand.test_parameters) + @unittest.expectedFailure # TODO: MLBEDSW-9386 + def test_expand_u85_BI(self, test_input, multiples): + self._test_expand_ethosu_BI_pipeline( + self.Expand(), common.get_u85_compile_spec(), (test_input, multiples) + ) diff --git a/backends/arm/test/ops/test_full.py b/backends/arm/test/ops/test_full.py index 1be7f59ab8f..2722edef328 100644 --- a/backends/arm/test/ops/test_full.py +++ b/backends/arm/test/ops/test_full.py @@ -15,6 +15,7 @@ import torch from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.exir.backend.compile_spec_schema import CompileSpec from parameterized import parameterized @@ -93,13 +94,11 @@ def _test_full_tosa_BI_pipeline( .run_method_and_compare_outputs(inputs=test_data) ) - def _test_full_tosa_u55_pipeline(self, module: torch.nn.Module, test_data: Tuple): + def _test_full_tosa_ethos_pipeline( + self, compile_spec: list[CompileSpec], module: torch.nn.Module, test_data: Tuple + ): ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_u55_compile_spec(), - ) + ArmTester(module, example_inputs=test_data, compile_spec=compile_spec) .quantize() .export() .check_count({"torch.ops.aten.full.default": 1}) @@ -110,6 +109,16 @@ def _test_full_tosa_u55_pipeline(self, module: torch.nn.Module, test_data: Tuple .to_executorch() ) + def _test_full_tosa_u55_pipeline(self, module: torch.nn.Module, test_data: Tuple): + self._test_full_tosa_ethos_pipeline( + common.get_u55_compile_spec(), module, test_data + ) + + def _test_full_tosa_u85_pipeline(self, module: torch.nn.Module, test_data: Tuple): + self._test_full_tosa_ethos_pipeline( + common.get_u85_compile_spec(), module, test_data + ) + def test_only_full_tosa_MI(self): self._test_full_tosa_MI_pipeline(self.Full(), ()) @@ -138,6 +147,13 @@ def test_full_u55_BI(self, test_tensor: Tuple): test_tensor, ) + @parameterized.expand(AddVariableFull.test_parameters) + def test_full_u85_BI(self, test_tensor: Tuple): + self._test_full_tosa_u85_pipeline( + self.AddVariableFull(), + test_tensor, + ) + # This fails since full outputs int64 by default if 'fill_value' is integer, which our backend doesn't support. 
@unittest.expectedFailure def test_integer_value(self): diff --git a/backends/arm/test/ops/test_linear.py b/backends/arm/test/ops/test_linear.py index 33f62955ecd..3f68ab0251a 100644 --- a/backends/arm/test/ops/test_linear.py +++ b/backends/arm/test/ops/test_linear.py @@ -15,6 +15,7 @@ from executorch.backends.arm.test.tester.arm_tester import ArmTester from executorch.exir import EdgeCompileConfig +from executorch.exir.backend.compile_spec_schema import CompileSpec from parameterized import parameterized logger = logging.getLogger(__name__) @@ -26,17 +27,17 @@ ( "model_linear_rank1_zeros", torch.zeros(10), - 10, + 15, ), ( "model_linear_rank1_ones", torch.ones(10), - 10, + 15, ), ( "model_linear_rank1_negative_ones", torch.ones(10) * (-1), - 10, + 20, ), ( "model_linear_rank1_rand", @@ -46,12 +47,12 @@ ( "model_linear_rank1_negative_large_rand", torch.rand(10) * (-100), - 10, + 30, ), ( "model_linear_rank1_large_randn", - torch.randn(10) * 100, - 10, + torch.randn(15) * 100, + 20, ), ] @@ -153,14 +154,17 @@ def _test_linear_tosa_BI_pipeline( .run_method_and_compare_outputs(inputs=test_data, qtol=True) ) - def _test_linear_tosa_u55_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] - ): + def _test_linear_tosa_ethosu_BI_pipeline( + self, + module: torch.nn.Module, + compile_spec: CompileSpec, + test_data: Tuple[torch.Tensor], + ) -> ArmTester: tester = ( ArmTester( module, example_inputs=test_data, - compile_spec=common.get_u55_compile_spec(permute_memory_to_nhwc=False), + compile_spec=compile_spec, ) .quantize() .export() @@ -172,9 +176,7 @@ def _test_linear_tosa_u55_BI_pipeline( .to_executorch() .serialize() ) - - if common.is_option_enabled("corstone300"): - tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) + return tester @parameterized.expand(test_data_suite_rank1 + test_data_suite_rank4) def test_linear_tosa_MI( @@ -215,10 +217,32 @@ def test_linear_tosa_u55_BI( ): in_features = test_data.shape[-1] test_data = (test_data,) - self._test_linear_tosa_u55_BI_pipeline( + tester = self._test_linear_tosa_ethosu_BI_pipeline( + self.Linear( + in_features=in_features, + out_features=out_features, + ), + common.get_u55_compile_spec(permute_memory_to_nhwc=False), + test_data, + ) + + if common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) + + @parameterized.expand(test_data_suite_rank1) + def test_linear_tosa_u85_BI( + self, + test_name: str, + test_data: torch.Tensor, + out_features: int, + ): + in_features = test_data.shape[-1] + test_data = (test_data,) + self._test_linear_tosa_ethosu_BI_pipeline( self.Linear( in_features=in_features, out_features=out_features, ), + common.get_u85_compile_spec(permute_memory_to_nhwc=False), test_data, ) diff --git a/backends/arm/test/ops/test_log.py b/backends/arm/test/ops/test_log.py new file mode 100644 index 00000000000..269b7be25f5 --- /dev/null +++ b/backends/arm/test/ops/test_log.py @@ -0,0 +1,120 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# Copyright 2024 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import unittest + +from typing import Tuple + +import torch +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.exir.backend.backend_details import CompileSpec +from parameterized import parameterized + +test_data_suite = [ + # (test_name, test_data) + ("ones_rank4", torch.ones(1, 10, 10, 10)), + ("ones_rank3", torch.ones(10, 10, 10)), + ("rand", torch.rand(10, 10) + 0.001), + ("randn_pos", torch.randn(10) + 10), + ("randn_spread", torch.max(torch.Tensor([0.0]), torch.randn(10) * 100)), + ("ramp", torch.arange(0.01, 20, 0.2)), +] + + +class TestLog(unittest.TestCase): + """Tests lowering of aten.log""" + + class Log(torch.nn.Module): + def forward(self, x: torch.Tensor) -> torch.Tensor: + return torch.log(x) + + def _test_log_tosa_MI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.tensor] + ): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(), + ) + .export() + .check(["torch.ops.aten.log.default"]) + .check_not(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_not(["executorch_exir_dialects_edge__ops_aten_log_default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data) + ) + + def _test_log_tosa_BI_pipeline(self, module: torch.nn.Module, test_data: Tuple): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(), + ) + .quantize() + .export() + .check(["torch.ops.aten.log.default"]) + .check(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_not(["executorch_exir_dialects_edge__ops_aten_log_default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data) + ) + + def _test_log_ethosu_BI_pipeline( + self, + compile_spec: CompileSpec, + module: torch.nn.Module, + test_data: Tuple[torch.tensor], + ): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=compile_spec, + ) + .quantize() + .export() + .check_count({"torch.ops.aten.log.default": 1}) + .check(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_not(["executorch_exir_dialects_edge__ops_aten_log_default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + ) + + @parameterized.expand(test_data_suite) + def test_log_tosa_MI( + self, + test_name: str, + test_data: torch.Tensor, + ): + self._test_log_tosa_MI_pipeline(self.Log(), (test_data,)) + + @parameterized.expand(test_data_suite) + def test_log_tosa_BI(self, test_name: str, test_data: torch.Tensor): + self._test_log_tosa_BI_pipeline(self.Log(), (test_data,)) + + @parameterized.expand(test_data_suite) + def test_log_tosa_u55_BI(self, test_name: str, test_data: torch.Tensor): + self._test_log_ethosu_BI_pipeline( + common.get_u55_compile_spec(), self.Log(), (test_data,) + ) + + @parameterized.expand(test_data_suite) + def test_log_tosa_u85_BI(self, test_name: str, test_data: torch.Tensor): + self._test_log_ethosu_BI_pipeline( + common.get_u85_compile_spec(), self.Log(), (test_data,) + ) diff --git a/backends/arm/test/ops/test_mean_dim.py b/backends/arm/test/ops/test_mean_dim.py index e0db958f743..0653e84e704 100644 --- a/backends/arm/test/ops/test_mean_dim.py +++ b/backends/arm/test/ops/test_mean_dim.py @@ -13,6 +13,7 @@ import torch from executorch.backends.arm.test import common from 
executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.exir.backend.backend_details import CompileSpec from parameterized import parameterized logger = logging.getLogger(__name__) @@ -91,14 +92,17 @@ def _test_meandim_tosa_BI_pipeline( .run_method_and_compare_outputs(inputs=test_data, qtol=1) ) - def _test_meandim_tosa_u55_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.tensor] + def _test_meandim_tosa_ethosu_BI_pipeline( + self, + module: torch.nn.Module, + compile_spec: CompileSpec, + test_data: Tuple[torch.tensor], ): ( ArmTester( module, example_inputs=test_data, - compile_spec=common.get_u55_compile_spec(), + compile_spec=compile_spec, ) .quantize() .export() @@ -106,7 +110,12 @@ def _test_meandim_tosa_u55_BI_pipeline( .check(["torch.ops.quantized_decomposed"]) .to_edge() .partition() - .check_not(["executorch_exir_dialects_edge__ops_aten_mean_dim"]) + .check_not( + [ + "executorch_exir_dialects_edge__ops_aten_mean_dim", + "executorch_exir_dialects_edge__ops_aten_avg_pool2d_default", + ] + ) .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() ) @@ -133,4 +142,20 @@ def test_meandim_tosa_u55_BI( test_name: str, test_data: torch.Tensor, ): - self._test_meandim_tosa_u55_BI_pipeline(self.MeanDim(), (test_data,)) + self._test_meandim_tosa_ethosu_BI_pipeline( + self.MeanDim(), + common.get_u55_compile_spec(), + (test_data,), + ) + + @parameterized.expand(test_data_suite) + def test_meandim_tosa_u85_BI( + self, + test_name: str, + test_data: torch.Tensor, + ): + self._test_meandim_tosa_ethosu_BI_pipeline( + self.MeanDim(), + common.get_u85_compile_spec(), + (test_data,), + ) diff --git a/backends/arm/test/ops/test_mm.py b/backends/arm/test/ops/test_mm.py index 9a9b3ef579b..4271496eaa9 100644 --- a/backends/arm/test/ops/test_mm.py +++ b/backends/arm/test/ops/test_mm.py @@ -12,6 +12,7 @@ import torch from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.exir.backend.backend_details import CompileSpec from parameterized import parameterized logger = logging.getLogger(__name__) @@ -87,14 +88,17 @@ def _test_mm_tosa_BI_pipeline( .run_method_and_compare_outputs(inputs=test_data) ) - def _test_mm_u55_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] + def _test_mm_ethosu_BI_pipeline( + self, + compile_spec: CompileSpec, + module: torch.nn.Module, + test_data: Tuple[torch.Tensor], ): ( ArmTester( module, example_inputs=test_data, - compile_spec=common.get_u55_compile_spec(), + compile_spec=compile_spec, ) .quantize() .export() @@ -131,11 +135,29 @@ def test_mm_single_input_tosa_BI(self, operand1: torch.Tensor): @unittest.expectedFailure def test_mm_u55_BI(self, operand1: torch.Tensor, operand2: torch.Tensor): test_data = (operand1, operand2) - self._test_mm_u55_BI_pipeline(self.MM(), test_data) + self._test_mm_ethosu_BI_pipeline( + common.get_u55_compile_spec(), self.MM(), test_data + ) # Expected to fail with error: Warning, unsupported fusing of TOSA Rescale previous operator is of type: Memcpy @parameterized.expand(MMSingleInput.test_parameters) @unittest.expectedFailure def test_mm_single_input_u55_BI(self, operand1: torch.Tensor): test_data = (operand1,) - self._test_mm_u55_BI_pipeline(self.MMSingleInput(), test_data) + self._test_mm_ethosu_BI_pipeline( + common.get_u55_compile_spec(), self.MMSingleInput(), test_data + ) + + @parameterized.expand(MM.test_parameters) + def test_mm_u85_BI(self, operand1: 
torch.Tensor, operand2: torch.Tensor): + test_data = (operand1, operand2) + self._test_mm_ethosu_BI_pipeline( + common.get_u85_compile_spec(), self.MM(), test_data + ) + + @parameterized.expand(MMSingleInput.test_parameters) + def test_mm_single_input_u85_BI(self, operand1: torch.Tensor): + test_data = (operand1,) + self._test_mm_ethosu_BI_pipeline( + common.get_u85_compile_spec(), self.MMSingleInput(), test_data + ) diff --git a/backends/arm/test/ops/test_mul.py b/backends/arm/test/ops/test_mul.py index dee8b62f1b2..a1c2dba5fed 100644 --- a/backends/arm/test/ops/test_mul.py +++ b/backends/arm/test/ops/test_mul.py @@ -10,6 +10,7 @@ import torch from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.exir.backend.backend_details import CompileSpec from parameterized import parameterized test_data_sute = [ @@ -101,14 +102,17 @@ def _test_mul_tosa_BI_pipeline( .run_method_and_compare_outputs(inputs=test_data, qtol=1.0) ) - def _test_mul_u55_BI_pipeline( - self, module: torch.nn.Module, test_data: tuple[torch.Tensor, torch.Tensor] + def _test_mul_ethosu_BI_pipeline( + self, + compile_spec: CompileSpec, + module: torch.nn.Module, + test_data: tuple[torch.Tensor, torch.Tensor], ): ( ArmTester( module, example_inputs=test_data, - compile_spec=common.get_u55_compile_spec(permute_memory_to_nhwc=True), + compile_spec=compile_spec, ) .quantize() .export() @@ -141,9 +145,7 @@ def test_mul_tosa_BI( test_data = (input_, other_) self._test_mul_tosa_BI_pipeline(self.Mul(), test_data) - # Expected to fail since RESCALE cannot be fused with MUL in Vela. @parameterized.expand(test_data_sute) - @unittest.expectedFailure def test_mul_u55_BI( self, test_name: str, @@ -151,4 +153,18 @@ def test_mul_u55_BI( other_: torch.Tensor, ): test_data = (input_, other_) - self._test_mul_u55_BI_pipeline(self.Mul(), test_data) + self._test_mul_ethosu_BI_pipeline( + common.get_u55_compile_spec(), self.Mul(), test_data + ) + + @parameterized.expand(test_data_sute) + def test_mul_u85_BI( + self, + test_name: str, + input_: torch.Tensor, + other_: torch.Tensor, + ): + test_data = (input_, other_) + self._test_mul_ethosu_BI_pipeline( + common.get_u85_compile_spec(), self.Mul(), test_data + ) diff --git a/backends/arm/test/ops/test_relu.py b/backends/arm/test/ops/test_relu.py new file mode 100644 index 00000000000..effbccc74d5 --- /dev/null +++ b/backends/arm/test/ops/test_relu.py @@ -0,0 +1,132 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# Copyright 2024 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import unittest + +from typing import Tuple + +import torch +from executorch.backends.arm.quantizer.arm_quantizer import ( + ArmQuantizer, + get_symmetric_quantization_config, +) +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.backends.xnnpack.test.tester.tester import Quantize +from executorch.exir.backend.backend_details import CompileSpec +from parameterized import parameterized + + +test_data_suite = [ + # (test_name, test_data) + ("zeros", torch.zeros(1, 10, 10, 10)), + ("ones", torch.ones(10, 10, 10)), + ("rand", torch.rand(10, 10) - 0.5), + ("randn_pos", torch.randn(10) + 10), + ("randn_neg", torch.randn(10) - 10), + ("ramp", torch.arange(-16, 16, 0.2)), +] + + +class TestRelu(unittest.TestCase): + class Relu(torch.nn.Module): + def __init__(self): + super().__init__() + self.relu = torch.nn.ReLU() + + def forward(self, x): + return self.relu(x) + + def _test_relu_tosa_MI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.tensor] + ): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(), + ) + .export() + .check(["torch.ops.aten.relu.default"]) + .check_not(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_not(["executorch_exir_dialects_edge__ops_aten_relu_default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data) + ) + + def _test_relu_tosa_BI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.tensor] + ): + quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(), + ) + .quantize(Quantize(quantizer, get_symmetric_quantization_config())) + .export() + .check_count({"torch.ops.aten.relu.default": 1}) + .check(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_not(["executorch_exir_dialects_edge__ops_aten_relu_default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data) + ) + + def _test_relu_ethosu_BI_pipeline( + self, + compile_spec: CompileSpec, + module: torch.nn.Module, + test_data: Tuple[torch.tensor], + ): + quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=compile_spec, + ) + .quantize(Quantize(quantizer, get_symmetric_quantization_config())) + .export() + .check_count({"torch.ops.aten.relu.default": 1}) + .check(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_not(["executorch_exir_dialects_edge__ops_aten_relu_default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + ) + + @parameterized.expand(test_data_suite) + def test_relu_tosa_MI( + self, + test_name: str, + test_data: torch.Tensor, + ): + self._test_relu_tosa_MI_pipeline(self.Relu(), (test_data,)) + + @parameterized.expand(test_data_suite) + def test_relu_tosa_BI(self, test_name: str, test_data: torch.Tensor): + self._test_relu_tosa_BI_pipeline(self.Relu(), (test_data,)) + + @parameterized.expand(test_data_suite) + def test_relu_u55_BI(self, test_name: str, test_data: torch.Tensor): + self._test_relu_ethosu_BI_pipeline( + common.get_u55_compile_spec(), self.Relu(), (test_data,) + ) + + @parameterized.expand(test_data_suite) + def test_relu_u85_BI(self, 
test_name: str, test_data: torch.Tensor): + self._test_relu_ethosu_BI_pipeline( + common.get_u85_compile_spec(), self.Relu(), (test_data,) + ) diff --git a/backends/arm/test/ops/test_repeat.py b/backends/arm/test/ops/test_repeat.py index a6fad033456..542f0d6256b 100644 --- a/backends/arm/test/ops/test_repeat.py +++ b/backends/arm/test/ops/test_repeat.py @@ -21,6 +21,7 @@ from executorch.backends.arm.test.tester.arm_tester import ArmTester from executorch.backends.xnnpack.test.tester.tester import Quantize +from executorch.exir.backend.backend_details import CompileSpec from parameterized import parameterized @@ -77,13 +78,15 @@ def _test_repeat_tosa_BI_pipeline(self, module: torch.nn.Module, test_data: Tupl .run_method_and_compare_outputs(inputs=test_data, qtol=1) ) - def _test_repeat_tosa_u55_pipeline(self, module: torch.nn.Module, test_data: Tuple): + def _test_repeat_ethosu_pipeline( + self, compile_spec: CompileSpec, module: torch.nn.Module, test_data: Tuple + ): quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) ( ArmTester( module, example_inputs=test_data, - compile_spec=common.get_u55_compile_spec(), + compile_spec=compile_spec, ) .quantize(Quantize(quantizer, get_symmetric_quantization_config())) .export() @@ -103,8 +106,16 @@ def test_repeat_tosa_MI(self, test_input, multiples): def test_repeat_tosa_BI(self, test_input, multiples): self._test_repeat_tosa_BI_pipeline(self.Repeat(), (test_input, multiples)) - # Expected failure since tosa.TILE is unsupported by Vela. @parameterized.expand(Repeat.test_parameters) - @unittest.expectedFailure + @unittest.expectedFailure # TODO: MLBEDSW-9386 def test_repeat_u55_BI(self, test_input, multiples): - self._test_repeat_tosa_u55_pipeline(self.Repeat(), (test_input, multiples)) + self._test_repeat_ethosu_pipeline( + common.get_u55_compile_spec(), self.Repeat(), (test_input, multiples) + ) + + @parameterized.expand(Repeat.test_parameters) + @unittest.expectedFailure # TODO: MLBEDSW-9386 + def test_repeat_u85_BI(self, test_input, multiples): + self._test_repeat_ethosu_pipeline( + common.get_u85_compile_spec(), self.Repeat(), (test_input, multiples) + ) diff --git a/backends/arm/test/ops/test_sigmoid.py b/backends/arm/test/ops/test_sigmoid.py index 7a0435689f4..f75583164c1 100644 --- a/backends/arm/test/ops/test_sigmoid.py +++ b/backends/arm/test/ops/test_sigmoid.py @@ -13,6 +13,7 @@ import torch from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.exir.backend.compile_spec_schema import CompileSpec from parameterized import parameterized logger = logging.getLogger(__name__) @@ -102,14 +103,17 @@ def _test_sigmoid_tosa_BI_pipeline(self, module: torch.nn.Module, test_data: Tup .run_method_and_compare_outputs(inputs=test_data) ) - def _test_sigmoid_tosa_u55_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.tensor] + def _test_sigmoid_tosa_ethos_BI_pipeline( + self, + compile_spec: list[CompileSpec], + module: torch.nn.Module, + test_data: Tuple[torch.tensor], ): ( ArmTester( module, example_inputs=test_data, - compile_spec=common.get_u55_compile_spec(), + compile_spec=compile_spec, ) .quantize() .export() @@ -122,6 +126,20 @@ def _test_sigmoid_tosa_u55_BI_pipeline( .to_executorch() ) + def _test_sigmoid_tosa_u55_BI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.tensor] + ): + self._test_sigmoid_tosa_ethos_BI_pipeline( + common.get_u55_compile_spec(), module, test_data + ) + + def 
_test_sigmoid_tosa_u85_BI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.tensor] + ): + self._test_sigmoid_tosa_ethos_BI_pipeline( + common.get_u85_compile_spec(), module, test_data + ) + @parameterized.expand(test_data_suite) def test_sigmoid_tosa_MI( self, @@ -145,8 +163,10 @@ def test_sigmoid_add_sigmoid_tosa_BI(self): self.SigmoidAddSigmoid(), (test_data_suite[4][1], test_data_suite[3][1]) ) - # Fails due to Vela diff from Tosa spec, expected to work with Regor. @parameterized.expand(test_data_suite) - @unittest.expectedFailure def test_sigmoid_tosa_u55_BI(self, test_name: str, test_data: torch.Tensor): self._test_sigmoid_tosa_u55_BI_pipeline(self.Sigmoid(), (test_data,)) + + @parameterized.expand(test_data_suite) + def test_sigmoid_tosa_u85_BI(self, test_name: str, test_data: torch.Tensor): + self._test_sigmoid_tosa_u85_BI_pipeline(self.Sigmoid(), (test_data,)) diff --git a/backends/arm/test/ops/test_slice.py b/backends/arm/test/ops/test_slice.py index 14874df156e..ca026c7f420 100644 --- a/backends/arm/test/ops/test_slice.py +++ b/backends/arm/test/ops/test_slice.py @@ -15,6 +15,7 @@ from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.arm_tester import ArmTester from executorch.backends.xnnpack.test.tester.tester import Quantize +from executorch.exir.backend.compile_spec_schema import CompileSpec from parameterized import parameterized @@ -77,8 +78,11 @@ def _test_slice_tosa_BI_pipeline( .run_method_and_compare_outputs(inputs=test_data, qtol=1) ) - def _test_slice_u55_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] + def _test_slice_ethos_BI_pipeline( + self, + compile_spec: list[CompileSpec], + module: torch.nn.Module, + test_data: Tuple[torch.Tensor], ): quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) ( @@ -96,6 +100,20 @@ def _test_slice_u55_BI_pipeline( .to_executorch() ) + def _test_slice_u55_BI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] + ): + self._test_slice_ethos_BI_pipeline( + common.get_u55_compile_spec(), module, test_data + ) + + def _test_slice_u85_BI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] + ): + self._test_slice_ethos_BI_pipeline( + common.get_u85_compile_spec(), module, test_data + ) + @parameterized.expand(Slice.test_tensors) def test_slice_tosa_MI(self, tensor): self._test_slice_tosa_MI_pipeline(self.Slice(), (tensor,)) @@ -108,9 +126,10 @@ def test_slice_nchw_tosa_BI(self, test_tensor: torch.Tensor): def test_slice_nhwc_tosa_BI(self, test_tensor: torch.Tensor): self._test_slice_tosa_BI_pipeline(self.Slice(), (test_tensor,), True) - # Fails during Vela compilation when trying to use a Tuple as a Named tuple, - # Could be Vela Issue, wait until Regor. @parameterized.expand(Slice.test_tensors) - @unittest.expectedFailure def test_slice_u55_BI(self, test_tensor: torch.Tensor): self._test_slice_u55_BI_pipeline(self.Slice(), (test_tensor,)) + + @parameterized.expand(Slice.test_tensors) + def test_slice_u85_BI(self, test_tensor: torch.Tensor): + self._test_slice_u85_BI_pipeline(self.Slice(), (test_tensor,)) diff --git a/backends/arm/test/ops/test_softmax.py b/backends/arm/test/ops/test_softmax.py index b3b6230daa7..a7d25d266de 100644 --- a/backends/arm/test/ops/test_softmax.py +++ b/backends/arm/test/ops/test_softmax.py @@ -5,7 +5,6 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
-import logging import unittest from typing import Tuple @@ -13,17 +12,20 @@ import torch from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.exir.backend.compile_spec_schema import CompileSpec from parameterized import parameterized -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) test_data_suite = [ # (test_name, test_data, dim) - ("zeros", torch.zeros(10, 10, 10, 10), 1), + ("zeros", torch.zeros(10, 10, 10, 10), 0), + ("zeros_neg_dim", torch.zeros(10, 10, 10, 10), -4), ("ones", torch.ones(10, 10, 10, 10), 1), + ("ones_neg_dim", torch.ones(10, 10, 10, 10), -1), ("rand", torch.rand(10, 10, 10, 10), 2), + ("rand_neg_dim", torch.rand(10, 10, 10, 10), -2), ("randn", torch.randn(10, 10, 10, 10), 3), + ("randn_neg_dim", torch.randn(10, 10, 10, 10), -3), ] @@ -79,14 +81,17 @@ def _test_softmax_tosa_BI_pipeline( .run_method_and_compare_outputs(inputs=test_data, qtol=1) ) - def _test_softmax_tosa_u55_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.tensor] + def _test_softmax_tosa_ethos_BI_pipeline( + self, + compile_spec: list[CompileSpec], + module: torch.nn.Module, + test_data: Tuple[torch.tensor], ): ( ArmTester( module, example_inputs=test_data, - compile_spec=common.get_u55_compile_spec(), + compile_spec=compile_spec, ) .quantize() .export() @@ -99,6 +104,20 @@ def _test_softmax_tosa_u55_BI_pipeline( .to_executorch() ) + def _test_softmax_tosa_u55_BI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.tensor] + ): + self._test_softmax_tosa_ethos_BI_pipeline( + common.get_u55_compile_spec(), module, test_data + ) + + def _test_softmax_tosa_u85_BI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.tensor] + ): + self._test_softmax_tosa_ethos_BI_pipeline( + common.get_u85_compile_spec(), module, test_data + ) + @parameterized.expand(test_data_suite) def test_softmax_tosa_MI( self, @@ -131,3 +150,13 @@ def test_softmax_tosa_u55_BI( dim: int, ): self._test_softmax_tosa_u55_BI_pipeline(self.Softmax(dim=dim), (test_data,)) + + @parameterized.expand(test_data_suite) + @unittest.expectedFailure + def test_softmax_tosa_u85_BI( + self, + test_name: str, + test_data: torch.Tensor, + dim: int, + ): + self._test_softmax_tosa_u85_BI_pipeline(self.Softmax(dim=dim), (test_data,)) diff --git a/backends/arm/test/ops/test_split.py b/backends/arm/test/ops/test_split.py index bc998179c0c..02133d4e7f4 100644 --- a/backends/arm/test/ops/test_split.py +++ b/backends/arm/test/ops/test_split.py @@ -14,6 +14,7 @@ from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.arm_tester import ArmTester from executorch.backends.xnnpack.test.tester.tester import Quantize +from executorch.exir.backend.compile_spec_schema import CompileSpec from parameterized import parameterized test_data_t = tuple[torch.Tensor, int | list[int], int] @@ -94,15 +95,15 @@ def _test_split_tosa_BI_pipeline( .run_method_and_compare_outputs(inputs=test_data, qtol=1) ) - def _test_split_u55_BI_pipeline( - self, module: torch.nn.Module, test_data: test_data_t + def _test_split_ethosu_BI_pipeline( + self, compile_spec: CompileSpec, module: torch.nn.Module, test_data: test_data_t ): quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) ( ArmTester( module, example_inputs=test_data, - compile_spec=common.get_u55_compile_spec(), + compile_spec=compile_spec, ) .quantize(Quantize(quantizer, get_symmetric_quantization_config())) .export() @@ -131,9 +132,33 @@ 
def test_split_n_out_tosa_MI(self, test_data: test_data_t): def test_split_tosa_BI(self, test_data: test_data_t): self._test_split_tosa_BI_pipeline(self.Split(), test_data) - # Fails during Vela compilation when trying to use a Tuple as a Named tuple, - # Could be Vela Issue, wait until Regor. - @parameterized.expand(Split.test_data) - @unittest.expectedFailure + @parameterized.expand( + [Split.test_data[0], Split.test_data[1], Split.test_data[2], Split.test_data[4]] + ) def test_split_u55_BI(self, test_data: test_data_t): - self._test_split_u55_BI_pipeline(self.Split(), test_data) + self._test_split_ethosu_BI_pipeline( + common.get_u55_compile_spec(), self.Split(), test_data + ) + + # TODO MLETORCH-350 + @parameterized.expand([Split.test_data[3], Split.test_data[5]]) + @unittest.expectedFailure + def test_split_u55_BI_skip(self, test_data: test_data_t): + self._test_split_ethosu_BI_pipeline( + common.get_u55_compile_spec(), self.Split(), test_data + ) + + @parameterized.expand( + [Split.test_data[0], Split.test_data[1], Split.test_data[2], Split.test_data[4]] + ) + def test_split_u85_BI(self, test_data: test_data_t): + self._test_split_ethosu_BI_pipeline( + common.get_u85_compile_spec(), self.Split(), test_data + ) + + @parameterized.expand([Split.test_data[3], Split.test_data[5]]) + @unittest.expectedFailure + def test_split_u85_BI_skip(self, test_data: test_data_t): + self._test_split_ethosu_BI_pipeline( + common.get_u85_compile_spec(), self.Split(), test_data + ) diff --git a/backends/arm/test/ops/test_sub.py b/backends/arm/test/ops/test_sub.py index 2ae7c3ab36f..e80c0436989 100644 --- a/backends/arm/test/ops/test_sub.py +++ b/backends/arm/test/ops/test_sub.py @@ -13,6 +13,7 @@ from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.exir.backend.compile_spec_schema import CompileSpec from parameterized import parameterized @@ -75,14 +76,17 @@ def _test_sub_tosa_BI_pipeline( .run_method_and_compare_outputs(inputs=test_data, qtol=1) ) - def _test_sub_u55_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] + def _test_sub_ethosu_BI_pipeline( + self, + compile_spec: list[CompileSpec], + module: torch.nn.Module, + test_data: Tuple[torch.Tensor], ): ( ArmTester( module, example_inputs=test_data, - compile_spec=common.get_u55_compile_spec(), + compile_spec=compile_spec, ) .quantize() .export() @@ -104,14 +108,40 @@ def test_sub_tosa_BI(self, test_data: torch.Tensor): test_data = (test_data,) self._test_sub_tosa_BI_pipeline(self.Sub(), test_data) - # Expected to fail since RESCALE cannot be fused with SUB in Vela. 
@parameterized.expand(Sub.test_parameters) - @unittest.expectedFailure def test_sub_u55_BI(self, test_data: torch.Tensor): test_data = (test_data,) - self._test_sub_u55_BI_pipeline(self.Sub(), test_data) + self._test_sub_ethosu_BI_pipeline( + common.get_u55_compile_spec(), self.Sub(), test_data + ) + + @parameterized.expand(Sub.test_parameters) + def test_sub_u85_BI(self, test_data: torch.Tensor): + test_data = (test_data,) + self._test_sub_ethosu_BI_pipeline( + common.get_u85_compile_spec(), self.Sub(), test_data + ) @parameterized.expand(Sub2.test_parameters) def test_sub2_tosa_MI(self, operand1: torch.Tensor, operand2: torch.Tensor): test_data = (operand1, operand2) self._test_sub_tosa_MI_pipeline(self.Sub2(), test_data) + + @parameterized.expand(Sub2.test_parameters) + def test_sub2_tosa_BI(self, operand1: torch.Tensor, operand2: torch.Tensor): + test_data = (operand1, operand2) + self._test_sub_tosa_BI_pipeline(self.Sub2(), test_data) + + @parameterized.expand(Sub2.test_parameters) + def test_sub2_u55_BI(self, operand1: torch.Tensor, operand2: torch.Tensor): + test_data = (operand1, operand2) + self._test_sub_ethosu_BI_pipeline( + common.get_u55_compile_spec(), self.Sub2(), test_data + ) + + @parameterized.expand(Sub2.test_parameters) + def test_sub2_u85_BI(self, operand1: torch.Tensor, operand2: torch.Tensor): + test_data = (operand1, operand2) + self._test_sub_ethosu_BI_pipeline( + common.get_u85_compile_spec(), self.Sub2(), test_data + ) diff --git a/backends/arm/test/ops/test_unsqueeze.py b/backends/arm/test/ops/test_unsqueeze.py new file mode 100644 index 00000000000..9c79d4371c3 --- /dev/null +++ b/backends/arm/test/ops/test_unsqueeze.py @@ -0,0 +1,115 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +# +# Tests the unsqueeze op which copies the data of the input tensor (possibly with new data format) +# + +import unittest +from typing import Sequence, Tuple + +import torch + +from executorch.backends.arm.quantizer.arm_quantizer import ( + ArmQuantizer, + get_symmetric_quantization_config, +) +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.arm_tester import ArmTester + +from executorch.backends.xnnpack.test.tester.tester import Quantize +from executorch.exir.backend.compile_spec_schema import CompileSpec +from parameterized import parameterized + + +class TestSimpleUnsqueeze(unittest.TestCase): + class Unsqueeze(torch.nn.Module): + shapes: list[int | Sequence[int]] = [5, (5, 5), (5, 5), (5, 5, 5)] + test_parameters: list[tuple[torch.Tensor]] = [(torch.ones(n),) for n in shapes] + + def forward(self, x: torch.Tensor, dim): + return x.unsqueeze(dim) + + def _test_unsqueeze_tosa_MI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.Tensor, int] + ): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(), + ) + .export() + .check_count({"torch.ops.aten.unsqueeze.default": 1}) + .to_edge() + .partition() + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data) + ) + + def _test_unsqueeze_tosa_BI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.Tensor, int] + ): + quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(), + ) + .quantize(Quantize(quantizer, get_symmetric_quantization_config())) + .export() + .check_count({"torch.ops.aten.unsqueeze.default": 1}) + .to_edge() + .partition() + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data, qtol=1) + ) + + def _test_unsqueeze_ethosu_BI_pipeline( + self, + compile_spec: CompileSpec, + module: torch.nn.Module, + test_data: Tuple[torch.Tensor, int], + ): + quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=compile_spec, + ) + .quantize(Quantize(quantizer, get_symmetric_quantization_config())) + .export() + .check_count({"torch.ops.aten.unsqueeze.default": 1}) + .to_edge() + .partition() + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + ) + + @parameterized.expand(Unsqueeze.test_parameters) + def test_unsqueeze_tosa_MI(self, test_tensor: torch.Tensor): + for i in range(-test_tensor.dim() - 1, test_tensor.dim() + 1): + self._test_unsqueeze_tosa_MI_pipeline(self.Unsqueeze(), (test_tensor, i)) + + @parameterized.expand(Unsqueeze.test_parameters) + def test_unsqueeze_tosa_BI(self, test_tensor: torch.Tensor): + self._test_unsqueeze_tosa_BI_pipeline(self.Unsqueeze(), (test_tensor, 0)) + + @parameterized.expand(Unsqueeze.test_parameters) + def test_unsqueeze_u55_BI(self, test_tensor: torch.Tensor): + self._test_unsqueeze_ethosu_BI_pipeline( + common.get_u55_compile_spec(), self.Unsqueeze(), (test_tensor, 0) + ) + + @parameterized.expand(Unsqueeze.test_parameters) + def test_unsqueeze_u85_BI(self, test_tensor: torch.Tensor): + self._test_unsqueeze_ethosu_BI_pipeline( + common.get_u85_compile_spec(), self.Unsqueeze(), (test_tensor, 0) + ) diff --git a/backends/arm/test/ops/test_view.py b/backends/arm/test/ops/test_view.py 
index 1f51261bf7a..53025c0ac08 100644 --- a/backends/arm/test/ops/test_view.py +++ b/backends/arm/test/ops/test_view.py @@ -21,6 +21,7 @@ from executorch.backends.arm.test.tester.arm_tester import ArmTester from executorch.backends.xnnpack.test.tester.tester import Quantize +from executorch.exir.backend.compile_spec_schema import CompileSpec from parameterized import parameterized @@ -73,8 +74,11 @@ def _test_view_tosa_BI_pipeline( .run_method_and_compare_outputs(inputs=test_data, qtol=1) ) - def _test_view_u55_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] + def _test_view_ethos_BI_pipeline( + self, + compile_spec: list[CompileSpec], + module: torch.nn.Module, + test_data: Tuple[torch.Tensor], ): quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) ( @@ -92,6 +96,20 @@ def _test_view_u55_BI_pipeline( .to_executorch() ) + def _test_view_u55_BI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] + ): + self._test_view_ethos_BI_pipeline( + common.get_u55_compile_spec(), module, test_data + ) + + def _test_view_u85_BI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] + ): + self._test_view_ethos_BI_pipeline( + common.get_u85_compile_spec(), module, test_data + ) + @parameterized.expand(View.test_parameters) def test_view_tosa_MI(self, test_tensor: torch.Tensor): self._test_view_tosa_MI_pipeline(self.View(), (test_tensor,)) @@ -103,3 +121,7 @@ def test_view_tosa_BI(self, test_tensor: torch.Tensor): @parameterized.expand(View.test_parameters) def test_view_u55_BI(self, test_tensor: torch.Tensor): self._test_view_u55_BI_pipeline(self.View(), (test_tensor,)) + + @parameterized.expand(View.test_parameters) + def test_view_u85_BI(self, test_tensor: torch.Tensor): + self._test_view_u85_BI_pipeline(self.View(), (test_tensor,)) diff --git a/backends/arm/test/passes/test_meandim_to_averagepool2d.py b/backends/arm/test/passes/test_meandim_to_averagepool2d.py new file mode 100644 index 00000000000..1cd63e6e52e --- /dev/null +++ b/backends/arm/test/passes/test_meandim_to_averagepool2d.py @@ -0,0 +1,75 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import unittest + +import torch +from executorch.backends.arm.passes.meandim_to_averagepool_pass import ( + ConvertMeanDimToAveragePool, +) + +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.arm_tester import ArmTester + +from executorch.backends.xnnpack.test.tester.tester import RunPasses + + +class MeanDim(torch.nn.Module): + def forward(self, x): + return torch.mean(x, dim=[-1, -2], keepdim=True) + + def get_inputs(self): + return (torch.rand(1, 1280, 7, 7),) + + +class MeanDim2(torch.nn.Module): + def forward(self, x): + return torch.mean(x, dim=1) + + def get_inputs(self): + return (torch.rand(1, 1280, 7, 7),) + + +class TestMeandimToAveragePool2dPass(unittest.TestCase): + """ + Tests the MeanDimToAveragePool2dPass which converts mean.dim to average_pool2d + for the special case where dim is [-1, -2] and keepdim is True. 
+ """ + + def test_tosa_BI_meandim_to_averagepool(self): + module = MeanDim() + test_pass_stage = RunPasses([ConvertMeanDimToAveragePool]) + ( + ArmTester( + module, + example_inputs=module.get_inputs(), + compile_spec=common.get_tosa_compile_spec(), + ) + .quantize() + .export() + .to_edge() + .check(["executorch_exir_dialects_edge__ops_aten_mean_dim"]) + .run_passes(test_pass_stage) + .check(["executorch_exir_dialects_edge__ops_aten_avg_pool2d_default"]) + ) + + def test_tosa_BI_meandim_no_modification(self): + module = MeanDim2() + test_pass_stage = RunPasses([ConvertMeanDimToAveragePool]) + ( + ArmTester( + module, + example_inputs=module.get_inputs(), + compile_spec=common.get_tosa_compile_spec(), + ) + .quantize() + .export() + .to_edge() + .check(["executorch_exir_dialects_edge__ops_aten_mean_dim"]) + .run_passes(test_pass_stage) + .check(["executorch_exir_dialects_edge__ops_aten_mean_dim"]) + .check_not(["executorch_exir_dialects_edge__ops_aten_avg_pool2d_default"]) + ) diff --git a/backends/arm/test/runner_utils.py b/backends/arm/test/runner_utils.py index 4e3b447103c..6e8b9b25ede 100644 --- a/backends/arm/test/runner_utils.py +++ b/backends/arm/test/runner_utils.py @@ -10,6 +10,7 @@ import subprocess import tempfile +from pathlib import Path from typing import Dict, List, Optional, Tuple import numpy as np @@ -265,9 +266,12 @@ def run_corstone300( raise RuntimeError( f"Corstone simulation failed, log: \n {result_stdout}\n{result.stderr.decode()}" ) + elif "E [" in result_stdout: + logger.error(result_stdout) tosa_ref_output = np.fromfile(out_path_with_suffix, dtype=np.float32) - tosa_ref_output = torch.from_numpy(tosa_ref_output).reshape(inputs[0].shape) + output_shape = self.output_node.args[0][0].meta["val"].shape + tosa_ref_output = torch.from_numpy(tosa_ref_output).reshape(output_shape) return [tosa_ref_output] def run_tosa_ref_model( @@ -275,10 +279,10 @@ def run_tosa_ref_model( inputs: Tuple[torch.Tensor], ) -> list[torch.Tensor]: """ - Run TOSA reference model using the tosa_refence_model program. + Run TOSA reference model using the tosa_reference_model program. In order to do that we need: - 1. desc.json, which points to files needed by tosa_refence_model. + 1. desc.json, which points to files needed by tosa_reference_model. 2. output.tosa, which is the TOSA buffer that describes the model we're trying to run. @@ -287,12 +291,6 @@ def run_tosa_ref_model( All these files are saved on disk in self.intermediate_path. Args: - params_input (Tuple[List[str], List[QuantizationParams]]): A tuple - containing a list of input node names and a list of their - quantization parameters (if model is quantized). - param_output (Tuple[str, QuantizationParams]): A tuple containing - the output node name and its quantization parameters (if - model is quantized). inputs (Tuple[torch.Tensor]): The input data to run the TOSA Returns: @@ -328,7 +326,18 @@ def run_tosa_ref_model( self._has_init_run ), "RunnerUtil needs to be initialized using init_run() before running tosa reference." - desc_file_path = os.path.join(self.intermediate_path, "desc.json") + all_desc_file_paths = [ + str(path) for path in Path(self.intermediate_path).glob("desc*.json") + ] + assert ( + all_desc_file_paths + ), f"No TOSA description file found in '{self.intermediate_path}'." + if len(all_desc_file_paths) != 1: + raise NotImplementedError( + "Graphs with more than one partition are currently not supported." 
+ ) + + desc_file_path = all_desc_file_paths[0] assert os.path.exists( desc_file_path ), f"desc_file_path: {desc_file_path} does not exist" @@ -423,7 +432,7 @@ def save_npy( Parameters: path: the directory where to save the data. data: the data to save. - is_quantize: whether to quantize the data before saving it. + is_quantized: whether to quantize the data before saving it. input_name: the name of the file, without file-ending. quant_param: the parameters to use for quantization. Returns: @@ -448,7 +457,7 @@ def save_bytes( Parameters: path: the directory where to save the data. data: the data to save. - is_quantize: whether to quantize the data before saving it. + is_quantized: whether to quantize the data before saving it. input_name: the name of the file, without file-ending. quant_param: the parameters to use for quantization. Returns: diff --git a/backends/arm/test/tester/arm_tester.py b/backends/arm/test/tester/arm_tester.py index 41fc907fdfe..2fe8c07e7d1 100644 --- a/backends/arm/test/tester/arm_tester.py +++ b/backends/arm/test/tester/arm_tester.py @@ -34,6 +34,7 @@ from executorch.backends.xnnpack.test.tester import Tester from executorch.exir import EdgeCompileConfig from executorch.exir.backend.compile_spec_schema import CompileSpec +from executorch.exir.lowered_backend_module import LoweredBackendModule from torch.fx import Graph logger = logging.getLogger(__name__) @@ -44,21 +45,42 @@ class Partition(tester.Partition): def dump_artifact(self, path_to_dump: Optional[str]): super().dump_artifact(path_to_dump) - to_print = None - for spec in self.graph_module.lowered_module_0.compile_specs: - if spec.key == "output_format": - if spec.value == b"tosa": - tosa_fb = self.graph_module.lowered_module_0.processed_bytes + def get_output_format(lowered_module) -> str | None: + for spec in lowered_module.compile_specs: + if spec.key == "output_format": + return spec.value.decode() + return None + + output = "" + for node in self.graph_module.graph.nodes: + if node.op == "get_attr" and node.name.startswith("lowered_module_"): + lowered_module = getattr(self.graph_module, node.name) + assert isinstance( + lowered_module, LoweredBackendModule + ), f"Attribute {node.name} must be of type LoweredBackendModule." + + output_format = get_output_format(lowered_module) + if output_format == "tosa": + tosa_fb = lowered_module.processed_bytes to_print = dbg_tosa_fb_to_json(tosa_fb) to_print = pformat(to_print, compact=True, indent=1) - to_print = f"\n TOSA deserialized: \n{to_print}" - elif spec.value == b"vela": - vela_cmd_stream = self.graph_module.lowered_module_0.processed_bytes - to_print = str(vela_cmd_stream) - to_print = f"\n Vela command stream: \n{to_print}" - break - assert to_print is not None, "No TOSA nor Vela compile spec found" - _dump_str(to_print, path_to_dump) + output += f"\nTOSA deserialized {node.name}: \n{to_print}\n" + elif output_format == "vela": + vela_cmd_stream = lowered_module.processed_bytes + output += ( + f"\nVela command stream {node.name}: \n{vela_cmd_stream}\n" + ) + else: + logger.warning( + f"No TOSA nor Vela compile spec found in compile specs of {node.name}." + ) + continue + + if not output: + logger.warning("No output to print generated from artifact.") + return + + _dump_str(output, path_to_dump) class Serialize(tester.Serialize): @@ -242,16 +264,21 @@ def run_method_and_compare_outputs( # Loop inputs and compare reference stage with the compared stage. 
for run_iteration in range(num_runs): reference_input = inputs if inputs else next(self.generate_random_inputs()) - if is_nhwc: - test_input = self.transpose_data_format(reference_input, "NHWC") - else: - test_input = reference_input # Test parameters can include constants that are used in eager mode but are already set as attributes # in TOSA. Therefore, only accept torch.Tensor inputs. - test_input = [ - tensor for tensor in test_input if isinstance(tensor, torch.Tensor) - ] + test_input: list[torch.Tensor] = [] + for arg in reference_input: + if isinstance(arg, torch.Tensor): + test_input.append(arg) + if isinstance(arg, tuple) and isinstance(arg[0], torch.Tensor): + test_input.extend(list(arg)) + + if ( + is_nhwc + and test_stage == self.stages[self.stage_name(tester.ToExecutorch)] + ): + test_input = self.transpose_data_format(test_input, "NHWC") input_shapes = [ generated_input.shape if hasattr(generated_input, "shape") else (1,) @@ -261,7 +288,10 @@ def run_method_and_compare_outputs( reference_output = reference_stage.run_artifact(reference_input) test_output = tuple(test_stage.run_artifact(test_input)) - if is_nhwc: + if ( + is_nhwc + and test_stage == self.stages[self.stage_name(tester.ToExecutorch)] + ): test_output = self.transpose_data_format(test_output, "NCHW") self._compare_outputs( diff --git a/backends/arm/tosa_mapping.py b/backends/arm/tosa_mapping.py index 5749d1e2043..0baf3e2ec1b 100644 --- a/backends/arm/tosa_mapping.py +++ b/backends/arm/tosa_mapping.py @@ -3,6 +3,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + # # PyTorch to Tosa mapping - simple mapping functions and multi-type extraction # of key information. These are used by the initial compile stage which captures diff --git a/backends/arm/tosa_quant_utils.py b/backends/arm/tosa_quant_utils.py index c0d16d51b25..8a90e432a69 100644 --- a/backends/arm/tosa_quant_utils.py +++ b/backends/arm/tosa_quant_utils.py @@ -3,18 +3,21 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + # Utiliy functions for TOSA quantized lowerings import math -from typing import NamedTuple +from typing import NamedTuple, Sequence import numpy as np import serializer.tosa_serializer as ts import torch.fx +import tosa.Op as TosaOp from executorch.backends.arm.tosa_mapping import map_dtype, TosaArg from executorch.exir.dialects._ops import ops as exir_ops -from serializer.tosa_serializer import TosaOp, TosaSerializerTensor +from serializer.tosa_serializer import TosaSerializerTensor from torch.fx import Node q_op = exir_ops.edge.quantized_decomposed.quantize_per_tensor.default @@ -65,6 +68,7 @@ def is_quant_node(node: torch.fx.Node): def get_quant_node_dtype(node: torch.fx.Node): + # pyre-ignore[16]: Undefined attribute. 
if "tosa" in node.target.__name__: return node.meta["val"].dtype @@ -231,7 +235,7 @@ def build_rescale_from_int32( rescale_scale, is_scale32=True, is_double_round=False, -) -> TosaSerializerTensor: +) -> None: multiplier, shift = compute_multiplier_and_shift(rescale_scale) attr_rescale_output = ts.TosaSerializerAttribute() attr_rescale_output.RescaleAttribute( @@ -254,7 +258,7 @@ def build_rescale_from_int32( def rescale_nodes_to_int32( - nodes: list[Node], tosa_graph: ts.TosaSerializer + nodes: Sequence[Node], tosa_graph: ts.TosaSerializer ) -> tuple[list[TosaSerializerTensor], float]: """Rescales all 'nodes' to int32, adding suitable RESCALE ops to 'tosa_graph'. The scales are adjusted using the smallest scale of all 'nodes'. diff --git a/backends/arm/tosa_utils.py b/backends/arm/tosa_utils.py index f84e371279b..cfafac16760 100644 --- a/backends/arm/tosa_utils.py +++ b/backends/arm/tosa_utils.py @@ -3,9 +3,11 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import logging import os -from typing import Any, Dict +from typing import Any, cast, Dict import numpy as np import serializer.tosa_serializer as ts @@ -48,10 +50,10 @@ def dbg_node(node): # Output TOSA flatbuffer and test harness file -def dbg_tosa_dump(tosa_graph, path): - filename = "output.tosa" +def dbg_tosa_dump(tosa_graph: ts.TosaSerializer, path: str, suffix: str = ""): + filename = f"output{suffix}.tosa" - logger.info(f"Emitting debug output to {path}") + logger.info(f"Emitting debug output to: {path=}, {suffix=}") os.makedirs(path, exist_ok=True) @@ -63,7 +65,7 @@ def dbg_tosa_dump(tosa_graph, path): f.write(fb) assert os.path.exists(filepath_tosa_fb), "Failed to write TOSA flatbuffer" - filepath_desc_json = os.path.join(path, "desc.json") + filepath_desc_json = os.path.join(path, f"desc{suffix}.json") with open(filepath_desc_json, "w") as f: f.write(js) assert os.path.exists(filepath_desc_json), "Failed to write TOSA JSON" @@ -74,7 +76,7 @@ def dbg_fail(node, tosa_graph, path): logger.warn("Internal error due to poorly handled node:") dbg_node(node) logger.warn(f"Debug output captured in '{path}'.") - raise RuntimeError("TOSA Internal Error on node, enable logging for further info") + raise RuntimeError("TOSA Internal Error on node, enable logging for further info.") # Helper function to match TOSA's broadcasting rank requirement @@ -235,7 +237,7 @@ def build_avg_pool_2d_common( output_zp = 0 if is_quant_node: - input_zp = get_quant_node_args(node.args[0]).zp + input_zp = get_quant_node_args(cast(torch.fx.Node, node.args[0])).zp output_zp = get_quant_node_args(list(node.users)[0]).zp attr = ts.TosaSerializerAttribute() @@ -306,7 +308,9 @@ def process_call_function( ) # Visiting each Node + # pyre-ignore[16]: Undefined attribute. if node.target.__name__ in node_visitors: + # pyre-ignore[16]: Undefined attribute. node_visitors[node.target.__name__].define_node( node, tosa_graph, @@ -319,7 +323,10 @@ def process_call_function( def expand_dims( - tosa_graph: ts.TosaSerializer, input_node: TosaArg, dtype: ts.DType, dim: int + tosa_graph: ts.TosaSerializer, + input_node: TosaArg, + dtype: int, + dim: int, ) -> Any: """Inserts TOSA operators into the tosa_graph, that perform the equivalent of the expand_dims (a.k.a unsqueeze) operation. 
A new axis is created at the diff --git a/backends/cadence/CMakeLists.txt b/backends/cadence/CMakeLists.txt index f725655e0d6..d786142f085 100644 --- a/backends/cadence/CMakeLists.txt +++ b/backends/cadence/CMakeLists.txt @@ -27,8 +27,8 @@ set(_common_include_directories ${EXECUTORCH_ROOT}/..) set(TARGET_DIR reference) if(EXECUTORCH_NNLIB_OPT) -set(TARGET_DIR hifi) -add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/third-party/nnlib) + set(TARGET_DIR hifi) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/third-party/nnlib) endif() set(CMAKE_EXPORT_COMPILE_COMMANDS ON) @@ -68,7 +68,7 @@ target_include_directories( target_include_directories( cadence_runner PUBLIC ${ROOT_DIR}/.. ${CMAKE_BINARY_DIR} - ${_common_include_directories} + ${_common_include_directories} ) target_link_libraries( diff --git a/backends/cadence/aot/TARGETS b/backends/cadence/aot/TARGETS index d077169022a..08093efe317 100644 --- a/backends/cadence/aot/TARGETS +++ b/backends/cadence/aot/TARGETS @@ -60,6 +60,17 @@ python_library( ], ) +python_library( + name = "ops_registrations", + srcs = [ + "ops_registrations.py", + ], + deps = [ + "fbcode//caffe2:torch", + "fbcode//executorch/backends/cadence/aot:utils", + ], +) + export_file(name = "functions.yaml") executorch_generated_lib( diff --git a/backends/cadence/aot/compiler.py b/backends/cadence/aot/compiler.py index 509e254b550..e1494f8d20d 100644 --- a/backends/cadence/aot/compiler.py +++ b/backends/cadence/aot/compiler.py @@ -18,12 +18,13 @@ ReplaceLogicalNotBooleanWhereWithWherePass, ReplacePT2DequantWithCadenceDequantPass, ReplacePT2QuantWithCadenceQuantPass, + ReplaceSafeSoftmaxWithSoftmax, ReplaceScalarTensorWithFullPass, ReplaceSqueezeAndUnsqueezeWithViewPass, ) from executorch.backends.cadence.aot.quantizer.fusion_pass import QuantFusion from executorch.backends.cadence.aot.quantizer.quantizer import CadenceQuantizer -from executorch.backends.cadence.aot.utils import model_is_quantized +from executorch.backends.cadence.aot.utils import model_gm_has_SDPA, model_is_quantized from executorch.backends.transforms.decompose_sdpa import ( DecomposeScaledDotProductAttention, ) @@ -57,13 +58,20 @@ def convert_pt2( """ # Export with dynamo - model_exp = capture_pre_autograd_graph(model, inputs) + model_gm = capture_pre_autograd_graph(model, inputs) - # Decompose SDPA - DecomposeScaledDotProductAttention(False)(model_exp) + if model_gm_has_SDPA(model_gm): # pyre-fixme[6] + # Decompose SDPA + DecomposeScaledDotProductAttention(False)(model_gm) # pyre-fixme[6] + + # Swap _safe_softmax with _softmax (see https://github.com/pytorch/pytorch/pull/133882 + # for details). + result = ReplaceSafeSoftmaxWithSoftmax()(model_gm) # pyre-fixme[6] + assert result is not None + model_gm = result.graph_module # Prepare - prepared_model = prepare_pt2e(model_exp, quantizer) + prepared_model = prepare_pt2e(model_gm, quantizer) # Calibrate prepared_model(*inputs) diff --git a/backends/cadence/aot/ops_registrations.py b/backends/cadence/aot/ops_registrations.py index a4d856ebed2..e73de6ab7ce 100644 --- a/backends/cadence/aot/ops_registrations.py +++ b/backends/cadence/aot/ops_registrations.py @@ -4,12 +4,13 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
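The convert_pt2 change above only runs the SDPA decomposition when the captured graph actually contains an aten.scaled_dot_product_attention call, and swaps _safe_softmax for _softmax before prepare_pt2e. A minimal, self-contained sketch of that presence check (toy model and helper names are illustrative, not the ExecuTorch helper; torch.export.export_for_training needs a recent PyTorch, and whether SDPA survives capture as a single aten op can vary by release):

import torch


class TinyAttention(torch.nn.Module):
    def forward(self, q, k, v):
        return torch.nn.functional.scaled_dot_product_attention(q, k, v)


def graph_has_sdpa(gm: torch.fx.GraphModule) -> bool:
    # call_function nodes carry the resolved aten op as their target
    return any(
        node.op == "call_function"
        and node.target == torch.ops.aten.scaled_dot_product_attention.default
        for node in gm.graph.nodes
    )


if __name__ == "__main__":
    q = k = v = torch.randn(1, 2, 4, 8)
    captured = torch.export.export_for_training(TinyAttention(), (q, k, v)).module()
    # Gate the decomposition pass on the op actually being present.
    print("graph has SDPA:", graph_has_sdpa(captured))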
+# pyre-strict + from math import prod from typing import Optional, Tuple import torch -from executorch.exir.scalar_type import ScalarType -from torch.library import impl, Library +from torch.library import Library, register_fake from .utils import get_conv1d_output_size, get_conv2d_output_size @@ -67,31 +68,31 @@ m = Library("cadence", "IMPL", "Meta") -@impl(m, "quantize_per_tensor") +@register_fake("cadence::quantize_per_tensor") def quantize_per_tensor_meta( input: torch.Tensor, scale: float, zero_point: int, quant_min: int, quant_max: int, - dtype: ScalarType, -): + dtype: torch.dtype, +) -> torch.Tensor: return input.new_empty(input.size(), dtype=dtype) -@impl(m, "dequantize_per_tensor") +@register_fake("cadence::dequantize_per_tensor") def dequantize_per_tensor_meta( input: torch.Tensor, scale: float, zero_point: int, quant_min: int, quant_max: int, - dtype: ScalarType, -): + dtype: torch.dtype, +) -> torch.Tensor: return input.new_empty(input.size(), dtype=torch.float) -@impl(m, "quantized_linear") +@register_fake("cadence::quantized_linear") def quantized_linear_meta( src: torch.Tensor, weight: torch.Tensor, @@ -102,7 +103,7 @@ def quantized_linear_meta( out_shift: torch.Tensor, out_zero_point: int, offset: Optional[torch.Tensor], -): +) -> torch.Tensor: # src comes in shape [leading_dims, in_dim] # weight comes in shape [out_dim, in_dim] # output comes in empty with shape [leading_dims, out_dim] @@ -113,7 +114,7 @@ def quantized_linear_meta( return src.new_empty(out_size, dtype=torch.uint8) -@impl(m, "quantized_conv") +@register_fake("cadence::quantized_conv") def quantized_conv_meta( input: torch.Tensor, weight: torch.Tensor, @@ -151,7 +152,7 @@ def quantized_conv_meta( return input.new_empty(output_size, dtype=input.dtype) -@impl(m, "quantized_layer_norm") +@register_fake("cadence::quantized_layer_norm") def quantized_layer_norm_meta( input: torch.Tensor, X_scale: torch.Tensor, @@ -162,22 +163,22 @@ def quantized_layer_norm_meta( eps: float, output_scale: float, output_zero_point: int, -): +) -> torch.Tensor: return input.new_empty(input.size(), dtype=torch.uint8) -@impl(m, "quantized_relu") +@register_fake("cadence::quantized_relu") def quantized_relu_meta( X: torch.Tensor, X_zero_point: torch.Tensor, out_zero_point: int, out_multiplier: torch.Tensor, out_shift: torch.Tensor, -): +) -> torch.Tensor: return X.new_empty(X.size(), dtype=torch.uint8) -@impl(m, "quantized_matmul") +@register_fake("cadence::quantized_matmul") def quantized_matmul_meta( X: torch.Tensor, X_zero_point: int, diff --git a/backends/cadence/aot/passes.py b/backends/cadence/aot/passes.py index db419bfb5e1..83ef43d1510 100644 --- a/backends/cadence/aot/passes.py +++ b/backends/cadence/aot/passes.py @@ -266,3 +266,29 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult: result = SpecPropPass()(graph_module) assert result is not None return result + + +class ReplaceSafeSoftmaxWithSoftmax(ExportPass): + """ + Replace _safe_softmax with _softmax + """ + + def call_operator( + self, + op, # pyre-ignore + args: tuple[Argument, ...], + kwargs: dict[str, Argument], + meta: NodeMetadata, + ) -> ProxyValue: + if op != torch.ops.aten._safe_softmax.default: + return super().call_operator(op, args, kwargs, meta) + + # Add False for the half_to_float argument of softmax + softmax_args = list(args) + [False] + + return super().call_operator( + torch.ops.aten._softmax.default, + tuple(softmax_args), + kwargs, + meta, + ) diff --git a/backends/cadence/aot/quantizer/utils.py 
b/backends/cadence/aot/quantizer/utils.py index 2afe5aba32e..0f9c9399780 100644 --- a/backends/cadence/aot/quantizer/utils.py +++ b/backends/cadence/aot/quantizer/utils.py @@ -145,7 +145,7 @@ def get_aten_node_target_partitions( """ Args: graph: The graph we want to partition - wanted_sources: List of orginal_aten ops (OpOverload) + wanted_original_aten_op: List of original_aten ops (OpOverload) Returns: Dictionary mapping aten ops that were given to a list of SourcePartitions diff --git a/backends/cadence/aot/utils.py b/backends/cadence/aot/utils.py index f0c294260a7..f081036ccc1 100644 --- a/backends/cadence/aot/utils.py +++ b/backends/cadence/aot/utils.py @@ -104,11 +104,11 @@ def get_ops_count(graph_module: torch.fx.GraphModule) -> Dict[str, int]: ): continue # If the op is already present, increment the count - if get_edge_overload_packet(node.target).__name__ in freq: - freq[get_edge_overload_packet(node.target).__name__] += 1 + if node.target._name in freq: + freq[node.target._name] += 1 # else, add a new entry else: - freq[get_edge_overload_packet(node.target).__name__] = 1 + freq[node.target._name] = 1 return freq @@ -177,3 +177,11 @@ def print_ops_info( tablefmt="outline", ) ) + + +def model_gm_has_SDPA(model_gm: torch.fx.GraphModule) -> bool: + for node in model_gm.graph.nodes: + if node.op == "call_function": + if node.target == torch.ops.aten.scaled_dot_product_attention.default: + return True + return False diff --git a/backends/cadence/build_cadence_runner.sh b/backends/cadence/build_cadence_runner.sh index 51f363f8de4..693a320bdf4 100755 --- a/backends/cadence/build_cadence_runner.sh +++ b/backends/cadence/build_cadence_runner.sh @@ -23,7 +23,7 @@ main() { rm -rf cmake-out cmake -DCMAKE_INSTALL_PREFIX=cmake-out \ -DCMAKE_BUILD_TYPE=Release \ - -DEXECUTORCH_BUILD_SDK=ON \ + -DEXECUTORCH_BUILD_DEVTOOLS=ON \ -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ -DPYTHON_EXECUTABLE=python3 \ -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ diff --git a/backends/cadence/build_cadence_xtensa.sh b/backends/cadence/build_cadence_xtensa.sh new file mode 100644 index 00000000000..f96436e65d5 --- /dev/null +++ b/backends/cadence/build_cadence_xtensa.sh @@ -0,0 +1,88 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +set -euo pipefail + +unset CMAKE_PREFIX_PATH +git submodule sync +git submodule update --init +./install_requirements.sh + +rm -rf cmake-out + +STEPWISE_BUILD=false + +if $STEPWISE_BUILD; then + echo "Building ExecuTorch" + cmake -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DCMAKE_TOOLCHAIN_FILE=./backends/cadence/cadence.cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_ENABLE_EVENT_TRACER=OFF \ + -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ + -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=OFF \ + -DEXECUTORCH_BUILD_PTHREADPOOL=OFF \ + -DEXECUTORCH_BUILD_CPUINFO=OFF \ + -DEXECUTORCH_ENABLE_LOGGING=ON \ + -DEXECUTORCH_USE_DL=OFF \ + -DEXECUTORCH_BUILD_CADENCE=OFF \ + -DFLATC_EXECUTABLE="$(which flatc)" \ + -Bcmake-out . 
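The get_ops_count change above keys the operator histogram on the target's _name attribute instead of the edge overload packet name. A rough standalone version of the same bookkeeping (generic over any fx.GraphModule; the fallback to str(target) is an addition for ops that do not expose _name):

from collections import Counter

import torch


def op_histogram(gm: torch.fx.GraphModule) -> dict:
    # Count call_function targets; edge-dialect ops expose a readable `_name`,
    # everything else falls back to its string form.
    counts: Counter = Counter()
    for node in gm.graph.nodes:
        if node.op != "call_function":
            continue
        key = getattr(node.target, "_name", None) or str(node.target)
        counts[key] += 1
    return dict(counts)


if __name__ == "__main__":
    def doubled_relu(x):
        return torch.relu(x) + torch.relu(x)

    traced = torch.fx.symbolic_trace(doubled_relu)
    print(op_histogram(traced))  # small histogram of call_function targets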
+ + echo "Building any Cadence-specific binaries on top" + cmake -DBUCK2="$BUCK" \ + -DCMAKE_TOOLCHAIN_FILE=./backends/cadence/cadence.cmake \ + -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_HOST_TARGETS=ON \ + -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=ON \ + -DEXECUTORCH_BUILD_PTHREADPOOL=OFF \ + -DEXECUTORCH_BUILD_CADENCE=ON \ + -DFLATC_EXECUTABLE="$(which flatc)" \ + -DEXECUTORCH_ENABLE_LOGGING=ON \ + -DEXECUTORCH_ENABLE_PROGRAM_VERIFICATION=ON \ + -DEXECUTORCH_USE_DL=OFF \ + -DBUILD_EXECUTORCH_PORTABLE_OPS=ON \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=OFF \ + -DPYTHON_EXECUTABLE=python3 \ + -DEXECUTORCH_NNLIB_OPT=ON \ + -DEXECUTORCH_BUILD_GFLAGS=ON \ + -DHAVE_FNMATCH_H=OFF \ + -Bcmake-out/backends/cadence \ + backends/cadence + cmake --build cmake-out/backends/cadence -j16 +else + echo "Building Cadence toolchain with ExecuTorch packages" + cmake_prefix_path="${PWD}/cmake-out/lib/cmake/ExecuTorch;${PWD}/cmake-out/third-party/gflags" + cmake -DBUCK2="$BUCK" \ + -DCMAKE_PREFIX_PATH="${cmake_prefix_path}" \ + -DCMAKE_TOOLCHAIN_FILE=./backends/cadence/cadence.cmake \ + -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_HOST_TARGETS=ON \ + -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=ON \ + -DEXECUTORCH_BUILD_PTHREADPOOL=OFF \ + -DEXECUTORCH_BUILD_CADENCE=OFF \ + -DFLATC_EXECUTABLE="$(which flatc)" \ + -DEXECUTORCH_ENABLE_LOGGING=ON \ + -DEXECUTORCH_ENABLE_PROGRAM_VERIFICATION=ON \ + -DEXECUTORCH_USE_DL=OFF \ + -DBUILD_EXECUTORCH_PORTABLE_OPS=ON \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=OFF \ + -DPYTHON_EXECUTABLE=python3 \ + -DEXECUTORCH_NNLIB_OPT=ON \ + -DEXECUTORCH_BUILD_GFLAGS=ON \ + -DHAVE_FNMATCH_H=OFF \ + -DEXECUTORCH_ENABLE_EVENT_TRACER=OFF \ + -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ + -DEXECUTORCH_BUILD_CPUINFO=OFF \ + -Bcmake-out + cmake --build cmake-out --target install --config Release -j16 +fi + +echo "Run simple model to verify cmake build" +python3 -m examples.portable.scripts.export --model_name="add" +xt-run --turbo cmake-out/executor_runner --model_path=add.pte diff --git a/backends/cadence/cadence_runner/cadence_runner.cpp b/backends/cadence/cadence_runner/cadence_runner.cpp index d76ba004aae..a269ed5a8e8 100644 --- a/backends/cadence/cadence_runner/cadence_runner.cpp +++ b/backends/cadence/cadence_runner/cadence_runner.cpp @@ -22,13 +22,13 @@ #include +#include +#include #include #include #include #include #include -#include -#include static uint8_t method_allocator_pool[4 * 1024U * 1024U]; // 4MB diff --git a/backends/cadence/cadence_runner/targets.bzl b/backends/cadence/cadence_runner/targets.bzl index 028ff7ad2ef..b59a98cd75a 100644 --- a/backends/cadence/cadence_runner/targets.bzl +++ b/backends/cadence/cadence_runner/targets.bzl @@ -19,12 +19,11 @@ def define_common_targets(): visibility = ["PUBLIC"], deps = [ "fbsource//arvr/third-party/gflags:gflags", - "fbsource//xplat/executorch/kernels/portable:generated_lib", - "fbsource//xplat/executorch/runtime/executor:program", + "fbsource//xplat/executorch/devtools/etdump:etdump_flatcc", + "fbsource//xplat/executorch/devtools/bundled_program:runtime", "fbsource//xplat/executorch/extension/data_loader:file_data_loader", "fbsource//xplat/executorch/extension/data_loader:buffer_data_loader", - "fbsource//xplat/executorch/util:util", - "fbsource//xplat/executorch/sdk/etdump:etdump_flatcc", - "fbsource//xplat/executorch/sdk/bundled_program:runtime", + "fbsource//xplat/executorch/kernels/portable:generated_lib", + "fbsource//xplat/executorch/runtime/executor:program", ], 
) diff --git a/backends/cadence/hifi/kernels/CMakeLists.txt b/backends/cadence/hifi/kernels/CMakeLists.txt index 8cbeb3e1806..15d1a4ddd52 100644 --- a/backends/cadence/hifi/kernels/CMakeLists.txt +++ b/backends/cadence/hifi/kernels/CMakeLists.txt @@ -13,11 +13,12 @@ add_library( target_include_directories( cadence_kernels - PUBLIC . - ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4/xa_nnlib/algo/common/include/ - ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4/xa_nnlib/include/nnlib - ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4/xa_nnlib/include - ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4/xa_nnlib/algo/ndsp/hifi4/include/ + PUBLIC + . + ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4/xa_nnlib/algo/common/include/ + ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4/xa_nnlib/include/nnlib + ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4/xa_nnlib/include + ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4/xa_nnlib/algo/ndsp/hifi4/include/ ) target_link_libraries(cadence_kernels PRIVATE xa_nnlib) diff --git a/backends/cadence/hifi/kernels/TARGETS b/backends/cadence/hifi/kernels/TARGETS new file mode 100644 index 00000000000..67f2bab681a --- /dev/null +++ b/backends/cadence/hifi/kernels/TARGETS @@ -0,0 +1,5 @@ +load("targets.bzl", "define_common_targets") + +oncall("odai_jarvis") + +define_common_targets() diff --git a/backends/cadence/hifi/kernels/kernels.cpp b/backends/cadence/hifi/kernels/kernels.cpp index 5a2d58d2e2f..4d9183e4cc2 100644 --- a/backends/cadence/hifi/kernels/kernels.cpp +++ b/backends/cadence/hifi/kernels/kernels.cpp @@ -6,9 +6,9 @@ * LICENSE file in the root directory of this source tree. 
*/ -#include -#include "xa_nnlib_common.h" -#include "xa_nnlib_common_macros.h" +#include +#include +#include namespace impl { namespace HiFi { diff --git a/backends/cadence/hifi/kernels/kernels.h b/backends/cadence/hifi/kernels/kernels.h index 789c8942a85..b5659824615 100644 --- a/backends/cadence/hifi/kernels/kernels.h +++ b/backends/cadence/hifi/kernels/kernels.h @@ -8,12 +8,9 @@ #pragma once -#include "inttypes.h" -#include "stddef.h" -#include "xa_type_def.h" - -/* For NNLIB APIs */ -#include "xa_nnlib_kernels_api.h" +#include +#include +#include namespace impl { namespace HiFi { diff --git a/backends/cadence/hifi/kernels/targets.bzl b/backends/cadence/hifi/kernels/targets.bzl new file mode 100644 index 00000000000..acdc39dd16d --- /dev/null +++ b/backends/cadence/hifi/kernels/targets.bzl @@ -0,0 +1,18 @@ +load("@fbsource//tools/build_defs:platform_defs.bzl", "CXX") +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +def define_common_targets(): + runtime.cxx_library( + name = "kernels", + srcs = ["kernels.cpp"], + exported_headers = [ + "kernels.h", + ], + visibility = [ + "//executorch/backends/cadence/...", + ], + exported_deps = [ + "fbsource//third-party/nnlib-hifi4/xa_nnlib:libxa_nnlib_common", + ], + platforms = CXX, + ) diff --git a/backends/cadence/hifi/operators/CMakeLists.txt b/backends/cadence/hifi/operators/CMakeLists.txt index 996d109db48..8da6169cda1 100644 --- a/backends/cadence/hifi/operators/CMakeLists.txt +++ b/backends/cadence/hifi/operators/CMakeLists.txt @@ -44,7 +44,8 @@ set(_aten_ops__srcs "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_sub.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_to_copy.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_view_copy.cpp" - "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_where.cpp") + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_where.cpp" +) add_library(aten_ops_cadence ${_aten_ops__srcs}) target_link_libraries(aten_ops_cadence PUBLIC executorch) target_link_libraries(aten_ops_cadence PRIVATE cadence_kernels) @@ -52,18 +53,20 @@ target_link_libraries(aten_ops_cadence PRIVATE cadence_kernels) # Let files say "include ". set(_common_include_directories ${EXECUTORCH_ROOT}/..) -target_include_directories(aten_ops_cadence PUBLIC ${ROOT_DIR}/.. - ${CMAKE_BINARY_DIR} - ${_common_include_directories}) +target_include_directories( + aten_ops_cadence PUBLIC ${ROOT_DIR}/.. ${CMAKE_BINARY_DIR} + ${_common_include_directories} +) # Custom ops that are needed to run the test model. add_library( - custom_ops "quantized_linear_out.cpp" - "quantized_layer_norm.cpp" - "quantize_per_tensor.cpp" "dequantize_per_tensor.cpp") -target_include_directories(custom_ops PUBLIC ${ROOT_DIR}/.. - ${CMAKE_BINARY_DIR} - ${_common_include_directories}) + custom_ops "quantized_linear_out.cpp" "quantized_layer_norm.cpp" + "quantize_per_tensor.cpp" "dequantize_per_tensor.cpp" +) +target_include_directories( + custom_ops PUBLIC ${ROOT_DIR}/.. 
${CMAKE_BINARY_DIR} + ${_common_include_directories} +) target_link_libraries(custom_ops PUBLIC executorch) target_link_libraries(custom_ops PRIVATE cadence_kernels) @@ -75,12 +78,11 @@ gen_selected_ops( "${CMAKE_CURRENT_LIST_DIR}/../../aot/functions_hifi.yaml" "" "" ) generate_bindings_for_kernels( - LIB_NAME "cadence_ops_lib" OPS_SCHEMA_YAML - FUNCTIONS_YAML ${CMAKE_CURRENT_SOURCE_DIR}/../../aot/functions_hifi.yaml + LIB_NAME "cadence_ops_lib" OPS_SCHEMA_YAML FUNCTIONS_YAML + ${CMAKE_CURRENT_SOURCE_DIR}/../../aot/functions_hifi.yaml ) message("Generated files ${gen_command_sources}") gen_operators_lib( - LIB_NAME "cadence_ops_lib" - KERNEL_LIBS custom_ops - DEPS aten_ops_cadence) + LIB_NAME "cadence_ops_lib" KERNEL_LIBS custom_ops DEPS aten_ops_cadence +) diff --git a/backends/cadence/hifi/operators/TARGETS b/backends/cadence/hifi/operators/TARGETS new file mode 100644 index 00000000000..67f2bab681a --- /dev/null +++ b/backends/cadence/hifi/operators/TARGETS @@ -0,0 +1,5 @@ +load("targets.bzl", "define_common_targets") + +oncall("odai_jarvis") + +define_common_targets() diff --git a/backends/cadence/hifi/operators/dequantize_per_tensor.cpp b/backends/cadence/hifi/operators/dequantize_per_tensor.cpp index 0067f6510db..79645f5381d 100644 --- a/backends/cadence/hifi/operators/dequantize_per_tensor.cpp +++ b/backends/cadence/hifi/operators/dequantize_per_tensor.cpp @@ -6,19 +6,20 @@ * LICENSE file in the root directory of this source tree. */ -#include +#include #include +#include namespace impl { namespace HiFi { namespace native { using Tensor = exec_aten::Tensor; -using RuntimeContext = torch::executor::RuntimeContext; +using executorch::runtime::KernelRuntimeContext; using ScalarType = exec_aten::ScalarType; void dequantize_per_tensor_out( - RuntimeContext& context, + KernelRuntimeContext& context, const Tensor& input, double scale, int64_t zero_point, diff --git a/backends/cadence/hifi/operators/quantize_per_tensor.cpp b/backends/cadence/hifi/operators/quantize_per_tensor.cpp index bc0d315f3dd..e280f6bcffd 100644 --- a/backends/cadence/hifi/operators/quantize_per_tensor.cpp +++ b/backends/cadence/hifi/operators/quantize_per_tensor.cpp @@ -6,21 +6,22 @@ * LICENSE file in the root directory of this source tree. */ -#include +#include #include +#include namespace impl { namespace HiFi { namespace native { using Tensor = exec_aten::Tensor; -using RuntimeContext = torch::executor::RuntimeContext; +using executorch::runtime::KernelRuntimeContext; using ScalarType = exec_aten::ScalarType; // Quantize the input tensor (PT2 version). Note that quant_ are not // used in any computation. void quantize_per_tensor_out( - RuntimeContext& context, + KernelRuntimeContext& context, const Tensor& input, double scale, int64_t zero_point, diff --git a/backends/cadence/hifi/operators/quantized_layer_norm.cpp b/backends/cadence/hifi/operators/quantized_layer_norm.cpp index 034e5b28848..3974d6ee5e9 100644 --- a/backends/cadence/hifi/operators/quantized_layer_norm.cpp +++ b/backends/cadence/hifi/operators/quantized_layer_norm.cpp @@ -6,15 +6,14 @@ * LICENSE file in the root directory of this source tree. 
*/ -#include +#include #include - #include #include #include using Tensor = exec_aten::Tensor; -using RuntimeContext = torch::executor::RuntimeContext; +using executorch::runtime::KernelRuntimeContext; namespace impl { namespace HiFi { @@ -76,9 +75,11 @@ void quantized_layer_norm_( for (size_t j = 0; j < last_dim; ++j) { // Since X is quantized, we dequantize it, compute fp32 result, and // quantize the result to an int8/uint8 value. - float val = kernels::dequantize(x[j], input_scale, input_zero_point); + float val = impl::HiFi::kernels::dequantize( + x[j], input_scale, input_zero_point); val = (val - mean) * inv_std * weight_data[j] + bias_data[j]; - y[j] = kernels::quantize(val, output_inv_scale, output_zero_point); + y[j] = impl::HiFi::kernels::quantize( + val, output_inv_scale, output_zero_point); } } } @@ -114,7 +115,7 @@ void quantized_layer_norm_( } void quantized_layer_norm_out( - RuntimeContext& ctx, + KernelRuntimeContext& ctx, const Tensor& input, const Tensor& in_scale, const Tensor& in_zero_point, diff --git a/backends/cadence/hifi/operators/quantized_linear_out.cpp b/backends/cadence/hifi/operators/quantized_linear_out.cpp index ddba4df17c2..fb186abbb14 100644 --- a/backends/cadence/hifi/operators/quantized_linear_out.cpp +++ b/backends/cadence/hifi/operators/quantized_linear_out.cpp @@ -6,8 +6,7 @@ * LICENSE file in the root directory of this source tree. */ -#include - +#include #include #include #include @@ -17,10 +16,10 @@ namespace HiFi { namespace native { using Tensor = exec_aten::Tensor; -using RuntimeContext = torch::executor::RuntimeContext; +using executorch::runtime::KernelRuntimeContext; void quantized_linear_out( - RuntimeContext& ctx, + KernelRuntimeContext& ctx, const Tensor& src, const Tensor& weight, const Tensor& bias, diff --git a/backends/cadence/hifi/operators/targets.bzl b/backends/cadence/hifi/operators/targets.bzl new file mode 100644 index 00000000000..c7b24d790f0 --- /dev/null +++ b/backends/cadence/hifi/operators/targets.bzl @@ -0,0 +1,30 @@ +load("@fbsource//tools/build_defs:platform_defs.bzl", "CXX") +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +def define_common_targets(): + """Defines targets that should be shared between fbcode and xplat. + + The directory containing this targets.bzl file should also contain both + TARGETS and BUCK files that call this function. + """ + + # Define build targets for all operators registered in the tables above. 
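The HiFi quantized layer-norm kernels above dequantize each element, normalize with the row mean and inverse standard deviation, scale by weight and bias, then requantize. A reference sketch of that per-row math (standard affine quantize/dequantize assumed; the kernel multiplies by an inverse output scale, which the division below reproduces):

import torch


def quantized_layer_norm_ref(
    x_q: torch.Tensor,          # int8/uint8 input, shape [rows, last_dim]
    in_scale: float,
    in_zero_point: int,
    weight: torch.Tensor,       # fp32, shape [last_dim]
    bias: torch.Tensor,         # fp32, shape [last_dim]
    eps: float,
    out_scale: float,
    out_zero_point: int,
    out_dtype: torch.dtype = torch.uint8,
) -> torch.Tensor:
    x = (x_q.to(torch.float32) - in_zero_point) * in_scale        # dequantize
    mean = x.mean(dim=-1, keepdim=True)
    inv_std = torch.rsqrt(x.var(dim=-1, unbiased=False, keepdim=True) + eps)
    y = (x - mean) * inv_std * weight + bias                       # normalize
    info = torch.iinfo(out_dtype)
    y_q = torch.round(y / out_scale) + out_zero_point              # requantize
    return y_q.clamp(info.min, info.max).to(out_dtype)


if __name__ == "__main__":
    xq = torch.randint(0, 255, (2, 8), dtype=torch.uint8)
    w, b = torch.ones(8), torch.zeros(8)
    print(quantized_layer_norm_ref(xq, 0.05, 128, w, b, 1e-5, 0.05, 128))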
+ + runtime.cxx_library( + name = "cadence_hifi_ops", + srcs = glob([ + "*.cpp", + ]), + platforms = CXX, + deps = [ + "//executorch/kernels/portable/cpu/util:broadcast_util", + "//executorch/runtime/kernel:kernel_includes", + "//executorch/kernels/portable/cpu:scalar_utils", + "fbsource//third-party/nnlib-hifi4/xa_nnlib:libxa_nnlib", + "fbsource//third-party/nnlib-hifi4/xa_nnlib:libxa_nnlib_common", + "//executorch/backends/cadence/hifi/kernels:kernels", + ], + visibility = [ + "//executorch/backends/cadence/...", + ], + ) diff --git a/backends/cadence/hifi/third-party/nnlib/CMakeLists.txt b/backends/cadence/hifi/third-party/nnlib/CMakeLists.txt index e93e0759d2c..90eca6b47e1 100644 --- a/backends/cadence/hifi/third-party/nnlib/CMakeLists.txt +++ b/backends/cadence/hifi/third-party/nnlib/CMakeLists.txt @@ -1,30 +1,19 @@ - cmake_minimum_required(VERSION 3.10.0) project(cadence_nnlib) - -add_custom_target( nnlib_target ALL COMMAND - make install_nnlib -f makefile -C ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4/xa_nnlib/build - OBJDIR=${CMAKE_CURRENT_BINARY_DIR}/obj - LIBDIR=${CMAKE_CURRENT_BINARY_DIR}/lib - -j8 ) +add_custom_target( + nnlib_target ALL + COMMAND + make install_nnlib -f makefile -C + ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4/xa_nnlib/build + OBJDIR=${CMAKE_CURRENT_BINARY_DIR}/obj + LIBDIR=${CMAKE_CURRENT_BINARY_DIR}/lib -j8 +) add_library(xa_nnlib STATIC IMPORTED GLOBAL) add_dependencies(xa_nnlib nnlib_target) set_property( - TARGET xa_nnlib - PROPERTY - IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/lib/xa_nnlib.a" + TARGET xa_nnlib PROPERTY IMPORTED_LOCATION + "${CMAKE_CURRENT_BINARY_DIR}/lib/xa_nnlib.a" ) - - - - - - - - - - - diff --git a/backends/cadence/reference/kernels/CMakeLists.txt b/backends/cadence/reference/kernels/CMakeLists.txt index eadb01f54d5..fba66e9b27a 100644 --- a/backends/cadence/reference/kernels/CMakeLists.txt +++ b/backends/cadence/reference/kernels/CMakeLists.txt @@ -5,12 +5,6 @@ # LICENSE file in the root directory of this source tree. # lint_cmake: -linelength -add_library( - cadence_kernels - kernels.cpp -) +add_library(cadence_kernels kernels.cpp) -target_include_directories( - cadence_kernels - PUBLIC . -) +target_include_directories(cadence_kernels PUBLIC .) diff --git a/backends/cadence/reference/operators/CMakeLists.txt b/backends/cadence/reference/operators/CMakeLists.txt index 71b0304c997..605c43ef715 100644 --- a/backends/cadence/reference/operators/CMakeLists.txt +++ b/backends/cadence/reference/operators/CMakeLists.txt @@ -50,7 +50,8 @@ set(_aten_ops__srcs "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_where.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_expand_copy.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_gelu.cpp" - "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_empty.cpp") + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_empty.cpp" +) add_library(aten_ops_cadence ${_aten_ops__srcs}) target_link_libraries(aten_ops_cadence PUBLIC executorch) target_link_libraries(aten_ops_cadence PRIVATE cadence_kernels) @@ -58,19 +59,26 @@ target_link_libraries(aten_ops_cadence PRIVATE cadence_kernels) # Let files say "include ". set(_common_include_directories ${EXECUTORCH_ROOT}/..) -target_include_directories(aten_ops_cadence PUBLIC ${ROOT_DIR}/.. - ${CMAKE_BINARY_DIR} - ${_common_include_directories}) +target_include_directories( + aten_ops_cadence PUBLIC ${ROOT_DIR}/.. ${CMAKE_BINARY_DIR} + ${_common_include_directories} +) # Custom ops that are needed to run the test model. 
add_library( - custom_ops "quantized_linear_out.cpp" "quantized_conv_out.cpp" - "quantized_relu_out.cpp" "quantized_layer_norm.cpp" - "quantize_per_tensor.cpp" "dequantize_per_tensor.cpp" - "quantized_matmul_out.cpp") -target_include_directories(custom_ops PUBLIC ${ROOT_DIR}/.. - ${CMAKE_BINARY_DIR} - ${_common_include_directories}) + custom_ops + "quantized_linear_out.cpp" + "quantized_conv_out.cpp" + "quantized_relu_out.cpp" + "quantized_layer_norm.cpp" + "quantize_per_tensor.cpp" + "dequantize_per_tensor.cpp" + "quantized_matmul_out.cpp" +) +target_include_directories( + custom_ops PUBLIC ${ROOT_DIR}/.. ${CMAKE_BINARY_DIR} + ${_common_include_directories} +) target_link_libraries(custom_ops PUBLIC executorch) target_link_libraries(custom_ops PRIVATE cadence_kernels) @@ -82,12 +90,11 @@ gen_selected_ops( "${CMAKE_CURRENT_LIST_DIR}/../../aot/functions.yaml" "" "" ) generate_bindings_for_kernels( - LIB_NAME "cadence_ops_lib" OPS_SCHEMA_YAML - FUNCTIONS_YAML ${CMAKE_CURRENT_SOURCE_DIR}/../../aot/functions.yaml + LIB_NAME "cadence_ops_lib" OPS_SCHEMA_YAML FUNCTIONS_YAML + ${CMAKE_CURRENT_SOURCE_DIR}/../../aot/functions.yaml ) message("Generated cadence x86 files ${gen_command_sources}") gen_operators_lib( - LIB_NAME "cadence_ops_lib" - KERNEL_LIBS custom_ops - DEPS aten_ops_cadence) + LIB_NAME "cadence_ops_lib" KERNEL_LIBS custom_ops DEPS aten_ops_cadence +) diff --git a/backends/cadence/reference/operators/dequantize_per_tensor.cpp b/backends/cadence/reference/operators/dequantize_per_tensor.cpp index 29323ce612f..9c6cf6ecc55 100644 --- a/backends/cadence/reference/operators/dequantize_per_tensor.cpp +++ b/backends/cadence/reference/operators/dequantize_per_tensor.cpp @@ -14,11 +14,11 @@ namespace reference { namespace native { using Tensor = exec_aten::Tensor; -using RuntimeContext = torch::executor::RuntimeContext; +using executorch::runtime::KernelRuntimeContext; using ScalarType = exec_aten::ScalarType; void dequantize_per_tensor_out( - RuntimeContext& context, + KernelRuntimeContext& context, const Tensor& input, double scale, int64_t zero_point, diff --git a/backends/cadence/reference/operators/op_add.cpp b/backends/cadence/reference/operators/op_add.cpp index 3a8a3887171..89b67467605 100644 --- a/backends/cadence/reference/operators/op_add.cpp +++ b/backends/cadence/reference/operators/op_add.cpp @@ -16,7 +16,7 @@ namespace executor { namespace native { Tensor& add_out( - RuntimeContext& ctx, + KernelRuntimeContext& ctx, const Tensor& a, const Tensor& b, const Scalar& alpha, diff --git a/backends/cadence/reference/operators/op_embedding.cpp b/backends/cadence/reference/operators/op_embedding.cpp index f0b625c963e..e1e4984b56e 100644 --- a/backends/cadence/reference/operators/op_embedding.cpp +++ b/backends/cadence/reference/operators/op_embedding.cpp @@ -13,10 +13,10 @@ namespace executor { namespace native { using Tensor = exec_aten::Tensor; -using RuntimeContext = torch::executor::RuntimeContext; +using executorch::runtime::KernelRuntimeContext; void embedding_out( - RuntimeContext& ctx, + KernelRuntimeContext& ctx, const Tensor& weight, const Tensor& indices, int64_t padding_idx, diff --git a/backends/cadence/reference/operators/op_full.cpp b/backends/cadence/reference/operators/op_full.cpp index 75d1d51901a..00be1889651 100644 --- a/backends/cadence/reference/operators/op_full.cpp +++ b/backends/cadence/reference/operators/op_full.cpp @@ -17,7 +17,7 @@ using Tensor = exec_aten::Tensor; using ScalarType = exec_aten::ScalarType; Tensor& full_out( - RuntimeContext& ctx, + 
KernelRuntimeContext& ctx, const IntArrayRef sizes, const Scalar& fill_value, Tensor& out) { diff --git a/backends/cadence/reference/operators/op_view_copy.cpp b/backends/cadence/reference/operators/op_view_copy.cpp index a363125c375..ac0a8598499 100644 --- a/backends/cadence/reference/operators/op_view_copy.cpp +++ b/backends/cadence/reference/operators/op_view_copy.cpp @@ -13,10 +13,10 @@ namespace executor { namespace native { using Tensor = exec_aten::Tensor; -using RuntimeContext = torch::executor::RuntimeContext; +using executorch::runtime::KernelRuntimeContext; Tensor& view_copy_out( - RuntimeContext& ctx, + KernelRuntimeContext& ctx, const Tensor& input, const IntArrayRef size, Tensor& out) { diff --git a/backends/cadence/reference/operators/quantize_per_tensor.cpp b/backends/cadence/reference/operators/quantize_per_tensor.cpp index c2e53cda885..bc200fd376e 100644 --- a/backends/cadence/reference/operators/quantize_per_tensor.cpp +++ b/backends/cadence/reference/operators/quantize_per_tensor.cpp @@ -14,13 +14,13 @@ namespace reference { namespace native { using Tensor = exec_aten::Tensor; -using RuntimeContext = torch::executor::RuntimeContext; +using executorch::runtime::KernelRuntimeContext; using ScalarType = exec_aten::ScalarType; // Quantize the input tensor (PT2 version). Note that quant_ are not // used in any computation. void quantize_per_tensor_out( - RuntimeContext& context, + KernelRuntimeContext& context, const Tensor& input, double scale, int64_t zero_point, diff --git a/backends/cadence/reference/operators/quantized_conv_out.cpp b/backends/cadence/reference/operators/quantized_conv_out.cpp index 4bb7b12a887..47234a7cd95 100644 --- a/backends/cadence/reference/operators/quantized_conv_out.cpp +++ b/backends/cadence/reference/operators/quantized_conv_out.cpp @@ -17,7 +17,7 @@ namespace reference { namespace native { using Tensor = exec_aten::Tensor; -using RuntimeContext = torch::executor::RuntimeContext; +using executorch::runtime::KernelRuntimeContext; // This implements a generic 2d conv kernel that operates on raw pointers. // The version handles both quantized and fp32 convolutions. @@ -156,7 +156,7 @@ __attribute__((noinline)) void conv2d_nchw_core_generic( // quantized::conv1d or quantized::conv2d based on the dimensionality of // activation tensor. 
void quantized_conv_out( - RuntimeContext& ctx, + KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, const Tensor& bias, diff --git a/backends/cadence/reference/operators/quantized_layer_norm.cpp b/backends/cadence/reference/operators/quantized_layer_norm.cpp index 6588748d2da..a2dd644a976 100644 --- a/backends/cadence/reference/operators/quantized_layer_norm.cpp +++ b/backends/cadence/reference/operators/quantized_layer_norm.cpp @@ -14,7 +14,7 @@ #include using Tensor = exec_aten::Tensor; -using RuntimeContext = torch::executor::RuntimeContext; +using executorch::runtime::KernelRuntimeContext; namespace impl { namespace reference { @@ -112,7 +112,7 @@ void quantized_layer_norm_( } void quantized_layer_norm_out( - RuntimeContext& ctx, + KernelRuntimeContext& ctx, const Tensor& input, const Tensor& in_scale, const Tensor& in_zero_point, diff --git a/backends/cadence/reference/operators/quantized_linear_out.cpp b/backends/cadence/reference/operators/quantized_linear_out.cpp index 43289b3a28b..300158d8e5e 100644 --- a/backends/cadence/reference/operators/quantized_linear_out.cpp +++ b/backends/cadence/reference/operators/quantized_linear_out.cpp @@ -14,10 +14,10 @@ namespace reference { namespace native { using Tensor = exec_aten::Tensor; -using RuntimeContext = torch::executor::RuntimeContext; +using executorch::runtime::KernelRuntimeContext; void quantized_linear_out( - RuntimeContext& ctx, + KernelRuntimeContext& ctx, const Tensor& src, const Tensor& weight, const Tensor& bias, diff --git a/backends/cadence/reference/operators/quantized_matmul_out.cpp b/backends/cadence/reference/operators/quantized_matmul_out.cpp index d65175f8f17..b381a8ee394 100644 --- a/backends/cadence/reference/operators/quantized_matmul_out.cpp +++ b/backends/cadence/reference/operators/quantized_matmul_out.cpp @@ -14,7 +14,7 @@ namespace reference { namespace native { using Tensor = exec_aten::Tensor; -using RuntimeContext = torch::executor::RuntimeContext; +using executorch::runtime::KernelRuntimeContext; // The quantized matmul. The quantized matmul accumulates in a wider register, // whose type is TA. 
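The comment above notes that the quantized matmul accumulates in a wider register type. A small reference sketch of that pattern (requantization is collapsed to one float scale here; the real operator uses an integer multiplier/shift pair and the wider accumulator template type):

import torch


def quantized_matmul_ref(
    x_q: torch.Tensor, x_zp: int,      # int8, shape [m, k]
    y_q: torch.Tensor, y_zp: int,      # int8, shape [k, n]
    out_scale: float, out_zp: int,
) -> torch.Tensor:
    x32 = x_q.to(torch.int32) - x_zp                     # widen before accumulating
    y32 = y_q.to(torch.int32) - y_zp
    # elementwise product summed over k; everything stays in int32
    acc = (x32.unsqueeze(2) * y32.unsqueeze(0)).sum(dim=1)
    out = torch.round(acc.to(torch.float32) * out_scale) + out_zp
    return out.clamp(-128, 127).to(torch.int8)


if __name__ == "__main__":
    xq = torch.randint(-128, 128, (2, 4), dtype=torch.int8)
    yq = torch.randint(-128, 128, (4, 3), dtype=torch.int8)
    print(quantized_matmul_ref(xq, 0, yq, 0, out_scale=0.01, out_zp=0))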
@@ -108,7 +108,7 @@ void inline _typed_quantized_matmul( } void quantized_matmul_out( - RuntimeContext& ctx, + KernelRuntimeContext& ctx, const Tensor& X, int64_t X_zero_point, const Tensor& Y, diff --git a/backends/cadence/reference/operators/quantized_relu_out.cpp b/backends/cadence/reference/operators/quantized_relu_out.cpp index ef1813f65c7..04cb2c88336 100644 --- a/backends/cadence/reference/operators/quantized_relu_out.cpp +++ b/backends/cadence/reference/operators/quantized_relu_out.cpp @@ -14,7 +14,7 @@ namespace reference { namespace native { using Tensor = exec_aten::Tensor; -using RuntimeContext = torch::executor::RuntimeContext; +using executorch::runtime::KernelRuntimeContext; template void quantized_relu_( @@ -44,7 +44,7 @@ void quantized_relu_( } void quantized_relu_out( - RuntimeContext& ctx, + KernelRuntimeContext& ctx, const Tensor& input, const Tensor& in_zero_point, const int64_t out_zero_point, diff --git a/backends/cadence/runtime/TARGETS b/backends/cadence/runtime/TARGETS index 9f30cadf6fd..1b55a7d541b 100644 --- a/backends/cadence/runtime/TARGETS +++ b/backends/cadence/runtime/TARGETS @@ -13,9 +13,9 @@ python_library( typing = True, deps = [ "//caffe2:torch", + "//executorch/devtools/bundled_program:config", + "//executorch/devtools/bundled_program:core", + "//executorch/devtools/bundled_program/serialize:lib", "//executorch/exir:lib", - "//executorch/sdk/bundled_program:config", - "//executorch/sdk/bundled_program:core", - "//executorch/sdk/bundled_program/serialize:lib", ], ) diff --git a/backends/cadence/runtime/executor.py b/backends/cadence/runtime/executor.py index 7bcf705c034..d07b1b6a52e 100644 --- a/backends/cadence/runtime/executor.py +++ b/backends/cadence/runtime/executor.py @@ -18,14 +18,13 @@ import torch -from executorch.exir import ExecutorchProgram, ExecutorchProgramManager - -from executorch.sdk.bundled_program.config import MethodTestCase, MethodTestSuite -from executorch.sdk.bundled_program.core import BundledProgram +from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite +from executorch.devtools.bundled_program.core import BundledProgram -from executorch.sdk.bundled_program.serialize import ( +from executorch.devtools.bundled_program.serialize import ( serialize_from_bundled_program_to_flatbuffer, ) +from executorch.exir import ExecutorchProgram, ExecutorchProgramManager # If quiet is true, suppress the printing of stdout and stderr output. quiet = False diff --git a/backends/cadence/runtime/executor_main.sh b/backends/cadence/runtime/executor_main.sh index c850ab8b4a9..7d6cba09b87 100644 --- a/backends/cadence/runtime/executor_main.sh +++ b/backends/cadence/runtime/executor_main.sh @@ -5,7 +5,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -# Test the end-to-end flow of building sdk_example_runner and use it to run +# Test the end-to-end flow of building devtools/example_runner and use it to run # an actual model. 
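The executor.py hunk above only moves the bundled-program imports from executorch.sdk to executorch.devtools. For context, a sketch of how those pieces fit together; the constructor keyword names and the program_manager/model placeholders are assumptions based on the public devtools examples, not code from this file:

from typing import Any, Tuple

import torch

from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite
from executorch.devtools.bundled_program.core import BundledProgram
from executorch.devtools.bundled_program.serialize import (
    serialize_from_bundled_program_to_flatbuffer,
)


def bundle_and_serialize(
    program_manager: Any,  # an ExecutorchProgramManager produced elsewhere (placeholder)
    model: torch.nn.Module,
    example_inputs: Tuple[torch.Tensor, ...],
) -> bytes:
    # One test suite per method; keyword names are assumed from devtools examples.
    suite = MethodTestSuite(
        method_name="forward",
        test_cases=[
            MethodTestCase(
                inputs=list(example_inputs),
                expected_outputs=model(*example_inputs),
            )
        ],
    )
    bundled = BundledProgram(program_manager, [suite])
    return serialize_from_bundled_program_to_flatbuffer(bundled)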
@@ -14,21 +14,21 @@ set -e # shellcheck source=/dev/null source "$(dirname "${BASH_SOURCE[0]}")/../../.ci/scripts/utils.sh" -cmake_install_executorch_sdk_lib() { +cmake_install_executorch_devtools_lib() { echo "Installing libexecutorch.a, libportable_kernels.a, libetdump.a, libbundled_program.a" rm -rf cmake-out retry cmake -DCMAKE_INSTALL_PREFIX=cmake-out \ -DCMAKE_BUILD_TYPE=Release \ - -DEXECUTORCH_BUILD_SDK=ON \ + -DEXECUTORCH_BUILD_DEVTOOLS=ON \ -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ -Bcmake-out . cmake --build cmake-out -j9 --target install --config Release } -test_cmake_sdk_example_runner() { - local example_dir=examples/sdk +test_cmake_devtools_example_runner() { + local example_dir=examples/devtools local build_dir=cmake-out/${example_dir} CMAKE_PREFIX_PATH="${PWD}/cmake-out/lib/cmake/ExecuTorch;${PWD}/cmake-out/third-party/gflags" rm -rf ${build_dir} @@ -42,8 +42,8 @@ test_cmake_sdk_example_runner() { echo "Building ${example_dir}" cmake --build ${build_dir} -j9 --config Release - echo 'Running sdk_example_runner' - ${build_dir}/sdk_example_runner --bundled_program_path="./CadenceDemoModel.bpte" + echo 'Running devtools/example_runner' + ${build_dir}/example_runner --bundled_program_path="./CadenceDemoModel.bpte" } if [[ -z $PYTHON_EXECUTABLE ]]; @@ -56,5 +56,5 @@ then BUCK=buck2 fi -cmake_install_executorch_sdk_lib -test_cmake_sdk_example_runner +cmake_install_executorch_devtools_lib +test_cmake_devtools_example_runner diff --git a/backends/cadence/runtime/runtime.py b/backends/cadence/runtime/runtime.py index ec282f8f7b3..33bb20719c8 100644 --- a/backends/cadence/runtime/runtime.py +++ b/backends/cadence/runtime/runtime.py @@ -18,10 +18,10 @@ from executorch.backends.cadence.runtime import utils from executorch.backends.cadence.runtime.executor import Executor +from executorch.devtools import Inspector from executorch.exir import ExecutorchProgramManager from executorch.exir._serialize._program import deserialize_pte_binary from executorch.exir.schema import DataLocation -from executorch.sdk import Inspector from numpy import ndarray diff --git a/backends/example/test_example_delegate.py b/backends/example/test_example_delegate.py index 973b457bade..d830c1bb312 100644 --- a/backends/example/test_example_delegate.py +++ b/backends/example/test_example_delegate.py @@ -46,7 +46,7 @@ def get_example_inputs(): ) m = model.eval() - m = torch._export.capture_pre_autograd_graph(m, copy.deepcopy(example_inputs)) + m = torch.export.export_for_training(m, copy.deepcopy(example_inputs)).module() # print("original model:", m) quantizer = ExampleQuantizer() # quantizer = XNNPACKQuantizer() @@ -82,7 +82,7 @@ def test_delegate_mobilenet_v2(self): ) m = model.eval() - m = torch._export.capture_pre_autograd_graph(m, copy.deepcopy(example_inputs)) + m = torch.export.export_for_training(m, copy.deepcopy(example_inputs)).module() quantizer = ExampleQuantizer() m = prepare_pt2e(m, quantizer) diff --git a/backends/mediatek/CMakeLists.txt b/backends/mediatek/CMakeLists.txt index c7de8bb1f04..744b1193d5a 100644 --- a/backends/mediatek/CMakeLists.txt +++ b/backends/mediatek/CMakeLists.txt @@ -10,40 +10,39 @@ # Let include directory as "executorch/..." set(_common_include_directories ${CMAKE_CURRENT_SOURCE_DIR}/../../..) 
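The test change above replaces torch._export.capture_pre_autograd_graph with torch.export.export_for_training(...).module(). A minimal sketch of that migration with a toy model (export_for_training requires a PyTorch build that ships it; prepare_pt2e would then run on the returned module, as in the test):

import copy

import torch


class SmallModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(8, 4)

    def forward(self, x):
        return torch.relu(self.linear(x))


if __name__ == "__main__":
    model = SmallModel().eval()
    example_inputs = (torch.randn(2, 8),)

    # Old: m = torch._export.capture_pre_autograd_graph(model, example_inputs)
    m = torch.export.export_for_training(
        model, copy.deepcopy(example_inputs)
    ).module()

    print(type(m))  # a GraphModule ready for prepare_pt2e(m, quantizer)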
-set(NEURON_BUFFER_ALLOCATOR_LIB "" CACHE PATH "Path to Neuron Buffer Allocator library") -message(STATUS "Looking for neuron_buffer_allocator in ${NEURON_BUFFER_ALLOCATOR_LIB}") - -include_directories( - BEFORE - ${_common_include_directories} +set(NEURON_BUFFER_ALLOCATOR_LIB + "" + CACHE PATH "Path to Neuron Buffer Allocator library" +) +message( + STATUS "Looking for neuron_buffer_allocator in ${NEURON_BUFFER_ALLOCATOR_LIB}" ) +include_directories(BEFORE ${_common_include_directories}) + # shortcut include directory for neuron headers -include_directories( - BEFORE - ${CMAKE_CURRENT_SOURCE_DIR}/runtime/include -) +include_directories(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/runtime/include) # targets add_library(neuron_backend SHARED) target_link_libraries(neuron_backend PRIVATE executorch_no_prim_ops + portable_ops_lib android log ${NEURON_BUFFER_ALLOCATOR_LIB} ) -target_sources(neuron_backend - INTERFACE - ${CMAKE_CURRENT_LIST_DIR}/runtime/include/NeuronBackend.h - ${CMAKE_CURRENT_LIST_DIR}/runtime/include/NeuronBufferAllocator.h - ${CMAKE_CURRENT_LIST_DIR}/runtime/include/NeuronExecutor.h - ${CMAKE_CURRENT_LIST_DIR}/runtime/include/NeuronLog.h - ${CMAKE_CURRENT_LIST_DIR}/runtime/include/api/APUWareUtilsLib.h - ${CMAKE_CURRENT_LIST_DIR}/runtime/include/api/NeuronAdapterShim.h - PRIVATE - ${CMAKE_CURRENT_LIST_DIR}/runtime/NeuronBackend.cpp - ${CMAKE_CURRENT_LIST_DIR}/runtime/NeuronExecutor.cpp +target_sources( + neuron_backend + INTERFACE ${CMAKE_CURRENT_LIST_DIR}/runtime/include/NeuronBackend.h + ${CMAKE_CURRENT_LIST_DIR}/runtime/include/NeuronBufferAllocator.h + ${CMAKE_CURRENT_LIST_DIR}/runtime/include/NeuronExecutor.h + ${CMAKE_CURRENT_LIST_DIR}/runtime/include/NeuronLog.h + ${CMAKE_CURRENT_LIST_DIR}/runtime/include/api/APUWareUtilsLib.h + ${CMAKE_CURRENT_LIST_DIR}/runtime/include/api/NeuronAdapterShim.h + PRIVATE ${CMAKE_CURRENT_LIST_DIR}/runtime/NeuronBackend.cpp + ${CMAKE_CURRENT_LIST_DIR}/runtime/NeuronExecutor.cpp ) target_link_options_shared_lib(neuron_backend) diff --git a/backends/mediatek/runtime/include/NeuronBackend.h b/backends/mediatek/runtime/include/NeuronBackend.h index 2cfcb311b93..7a22956de63 100644 --- a/backends/mediatek/runtime/include/NeuronBackend.h +++ b/backends/mediatek/runtime/include/NeuronBackend.h @@ -26,7 +26,7 @@ namespace torch { namespace executor { -class NeuronBackend final : public PyTorchBackendInterface { +class NeuronBackend final : public ::executorch::runtime::BackendInterface { public: Result init( BackendInitContext& context, diff --git a/backends/qualcomm/CMakeLists.txt b/backends/qualcomm/CMakeLists.txt index babdb96d8bc..a8265df8c7b 100644 --- a/backends/qualcomm/CMakeLists.txt +++ b/backends/qualcomm/CMakeLists.txt @@ -66,9 +66,7 @@ if(CMAKE_BUILD_TYPE STREQUAL "Release") add_link_options("-s") # --gc-sections is added by torch. 
- add_compile_options( - "-O3" "-ffunction-sections" "-fdata-sections" "-frtti" - ) + add_compile_options("-O3" "-ffunction-sections" "-fdata-sections" "-frtti") endif() include_directories( @@ -183,7 +181,10 @@ target_link_libraries( ) target_link_libraries( qnn_executorch_backend PRIVATE qnn_executorch_header qnn_schema qnn_manager - executorch_no_prim_ops qcir_utils + executorch_no_prim_ops qcir_utils extension_tensor +) +set_target_properties( + qnn_executorch_backend PROPERTIES LINK_FLAGS "-Wl,-rpath='$ORIGIN'" ) target_link_libraries(utils PRIVATE qnn_executorch_logging) target_link_libraries( @@ -245,6 +246,7 @@ if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64") qnn_executorch_header executorch qcir_utils + extension_tensor ) target_link_libraries( PyQnnWrapperAdaptor PRIVATE pybind11::module pybind11::lto wrappers @@ -261,11 +263,8 @@ if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64") if(CMAKE_BUILD_TYPE STREQUAL "Release") # need to allow exceptions in pybind - set(_pybind_compile_options - -Wno-deprecated-declarations - -fPIC - -frtti - -fexceptions + set(_pybind_compile_options -Wno-deprecated-declarations -fPIC -frtti + -fexceptions ) target_compile_options( PyQnnManagerAdaptor PUBLIC ${_pybind_compile_options} diff --git a/backends/qualcomm/builders/__init__.py b/backends/qualcomm/builders/__init__.py index d3bf98bae72..79c02e22072 100644 --- a/backends/qualcomm/builders/__init__.py +++ b/backends/qualcomm/builders/__init__.py @@ -38,6 +38,7 @@ op_quantize, op_relu, op_reshape, + op_rms_norm, op_rsqrt, op_select_copy, op_sigmoid, @@ -92,6 +93,7 @@ op_quantize, op_relu, op_reshape, + op_rms_norm, op_rsqrt, op_select_copy, op_sigmoid, diff --git a/backends/qualcomm/builders/node_visitor.py b/backends/qualcomm/builders/node_visitor.py index e07a745df5f..514bc6efd78 100644 --- a/backends/qualcomm/builders/node_visitor.py +++ b/backends/qualcomm/builders/node_visitor.py @@ -202,7 +202,7 @@ def get_quant_tensor_value( dtype = quant_configs[QCOM_DTYPE] - tensor = tensor.div(scale + 1e-6).add(zero_point).round().to(dtype) + tensor = tensor.div(scale).add(zero_point).round().to(dtype) # Make the backends access data correctly if quant_configs.get(QCOM_BITWIDTH) == 4: mask = torch.full(tensor.size(), 0x0F, dtype=torch.int8) diff --git a/backends/qualcomm/builders/op_batch_norm.py b/backends/qualcomm/builders/op_batch_norm.py index 13b24c0d722..9ca299e7432 100644 --- a/backends/qualcomm/builders/op_batch_norm.py +++ b/backends/qualcomm/builders/op_batch_norm.py @@ -8,6 +8,11 @@ import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper import torch +from executorch.backends.qualcomm.utils.constants import ( + QCOM_QUANT_ATTRS, + QCOM_QUANT_MAX, + QCOM_SCALE, +) from .node_visitor import NodeVisitor, register_node_visitor from .qnn_constants import OpBatchnorm, QNN_OP_PACKAGE_NAME_QTI_AISW @@ -21,6 +26,15 @@ class BatchNorm(NodeVisitor): def __init__(self, *args) -> None: super().__init__(*args) + def update_encoding(self, node: torch.fx.Node, tensor: torch.Tensor, eps): + if isinstance(tensor, torch._subclasses.FakeTensor): + return + + if quant_attrs := node.meta.get(QCOM_QUANT_ATTRS): + # scale value equals to zero will cause failure in HTP + diff = max(abs(tensor.max()), abs(tensor.min())) + eps + quant_attrs[QCOM_SCALE] = diff / quant_attrs[QCOM_QUANT_MAX] + def define_node( self, node: torch.fx.Node, @@ -29,7 +43,7 @@ def define_node( input_node = node.args[0] input_tensor = self.get_tensor(input_node, node) - mean_node, var_node, eps = node.args[3], node.args[4], 1e-5 + 
mean_node, var_node, eps = node.args[3], node.args[4], 1e-9 mean_tensor = get_parameter(mean_node, self.edge_program) var_tensor = get_parameter(var_node, self.edge_program) @@ -48,6 +62,7 @@ def define_node( amount = (filter_tensor * mean_tensor) / torch.sqrt(var_tensor + eps) bias_tensor = bias_tensor - amount + self.update_encoding(bias_node, bias_tensor, eps) bias_tensor_wrapper = self.define_tensor( bias_node, bias_tensor, @@ -57,6 +72,7 @@ def define_node( ) filter_tensor = filter_tensor / torch.sqrt(var_tensor + eps) + self.update_encoding(filter_node, filter_tensor, eps) filter_tensor_wrapper = self.define_tensor( filter_node, filter_tensor, diff --git a/backends/qualcomm/builders/op_conv2d.py b/backends/qualcomm/builders/op_conv2d.py index 909cc6a21f6..4b58edbac63 100644 --- a/backends/qualcomm/builders/op_conv2d.py +++ b/backends/qualcomm/builders/op_conv2d.py @@ -10,16 +10,7 @@ import numpy as np import torch -from executorch.backends.qualcomm.utils.constants import ( - QCOM_DATA, - QCOM_DTYPE, - QCOM_QUANT_ATTRS, - QCOM_QUANT_MAX, - QCOM_QUANT_MIN, - QCOM_SCALE, - QCOM_ZERO_POINT, -) -from executorch.exir.dialects._ops import ops as exir_ops +from executorch.backends.qualcomm.utils.constants import QCOM_DATA from .node_visitor import NodeVisitor, register_node_visitor from .qnn_constants import ( @@ -94,52 +85,6 @@ def _add_conv_op_parameter( return conv_op - def _get_bias_tensor( - self, - node: torch.fx.Node, - nodes_to_wrappers: Dict[str, PyQnnWrapper.TensorWrapper], - num_output_channel: int, - ) -> PyQnnWrapper.PyQnnOpWrapper: - # build dummy node if bias is not given - bias_node = ( - node.args[2] - if node.args[2] is not None - else torch.fx.Node( - node.graph, - node.name + "_runtime_bias", - "call_function", - exir_ops.edge.aten.full.default, - (), # args - {}, # kwargs - ) - ) - # zeros tensor to meet HTP constraint if bias is not given - bias_tensor = ( - get_parameter(bias_node, self.edge_program) - if node.args[2] is not None - else torch.zeros(num_output_channel) - ) - # insert quant attribute to meet HTP constraint if bias is not given - if ( - node.args[2] is None - and (bias_quant_attrs := node.meta.get(QCOM_QUANT_ATTRS)) is not None - ): - quant_attrs = bias_quant_attrs.copy() - quant_attrs[QCOM_ZERO_POINT] = 0 - quant_attrs[QCOM_SCALE] = 0 - quant_attrs[QCOM_DTYPE] = torch.int32 - quant_attrs[QCOM_QUANT_MAX] = torch.iinfo(torch.int32).max - quant_attrs[QCOM_QUANT_MIN] = torch.iinfo(torch.int32).min + 1 - bias_node.meta[QCOM_QUANT_ATTRS] = quant_attrs - - return self.define_tensor( - bias_node, - bias_tensor, - PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_STATIC, - nodes_to_wrappers, - is_input_tensor=False, - ) - def _define_conv1d( self, node: torch.fx.Node, @@ -204,9 +149,17 @@ def _define_conv1d( is_input_tensor=False, ) conv_input_tensors = [unsqueeze_output_tensor_wrapper, filter_tensor_wrapper] - conv_input_tensors.append( - self._get_bias_tensor(node, nodes_to_wrappers, filter_tensor.shape[-1]) - ) + if node.args[2] is not None: + bias_node = node.args[2] + bias_tensor = get_parameter(bias_node, self.edge_program) + bias_tensor_wrapper = self.define_tensor( + bias_node, + bias_tensor, + PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_STATIC, + nodes_to_wrappers, + is_input_tensor=False, + ) + conv_input_tensors.append(bias_tensor_wrapper) stride = [1] + cast(List[int], node.args[3]) padding = [0] + cast(List[int], node.args[4]) @@ -312,9 +265,18 @@ def define_node( is_input_tensor=False, ) conv_input_tensors = [input_tensor_wrapper, 
filter_tensor_wrapper] - conv_input_tensors.append( - self._get_bias_tensor(node, nodes_to_wrappers, filter_tensor.shape[-1]) - ) + + if node.args[2] is not None: + bias_node = node.args[2] + bias_tensor = get_parameter(bias_node, self.edge_program) + bias_tensor_wrapper = self.define_tensor( + bias_node, + bias_tensor, + PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_STATIC, + nodes_to_wrappers, + is_input_tensor=False, + ) + conv_input_tensors.append(bias_tensor_wrapper) output_tensor = self.get_tensor(node, node) output_tensor_wrapper = self.define_tensor( diff --git a/backends/qualcomm/builders/op_rms_norm.py b/backends/qualcomm/builders/op_rms_norm.py new file mode 100644 index 00000000000..e99b1f47ba1 --- /dev/null +++ b/backends/qualcomm/builders/op_rms_norm.py @@ -0,0 +1,127 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Dict + +import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper +import numpy as np + +import torch +from executorch.backends.qualcomm.builders.utils import get_parameter +from executorch.backends.qualcomm.utils.constants import QCOM_DATA, QCOM_QUANT_ATTRS +from executorch.exir.dialects._ops import ops as exir_ops + +from .node_visitor import NodeVisitor, register_node_visitor +from .qnn_constants import OpRmsNorm, QNN_OP_PACKAGE_NAME_QTI_AISW + + +@register_node_visitor +class RmsNormVisitor(NodeVisitor): + target = ["aten.rms_norm.default"] + + def __init__(self, *args) -> None: + super().__init__(*args) + + def define_node( + self, + node: torch.fx.Node, + nodes_to_wrappers: Dict[torch.fx.Node, PyQnnWrapper.TensorWrapper], + ) -> PyQnnWrapper.PyQnnOpWrapper: + # args of node : ['input', 'normalized_shape', 'weight', 'eps'] + input_node = node.args[0] + input_tensor = self.get_tensor(input_node, node) + input_tensor_wrapper = self.define_tensor( + input_node, + input_tensor, + PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE, + nodes_to_wrappers, + is_input_tensor=True, + ) + + # should be a immutable list + normalized_shapes = node.args[1] + if ( + len(normalized_shapes) != 1 + and normalized_shapes[0] != input_tensor.shape[-1] + ): + print("Only supports normalization with last input dimension") + return + axes = [node.args[0].meta["val"].dim() - 1] + axes_shape = [len(axes)] + + weight_node = node.args[2] + weight_tensor = get_parameter(weight_node, self.edge_program) + weight_tensor_wrapper = self.define_tensor( + weight_node, + weight_tensor, + PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_STATIC, + nodes_to_wrappers, + is_input_tensor=False, + ) + + # Fake node, nn moudle seems to be inconsistant with document + bias_tensor = torch.zeros(weight_tensor.shape) + bias_node = torch.fx.Node( + node.graph, + node.name + "_runtime_bias", + "call_function", + exir_ops.edge.aten.tensor.default, + (), # args + {}, # kwargs + ) + if quant_attrs := node.meta.get(QCOM_QUANT_ATTRS): + bias_node.meta[QCOM_QUANT_ATTRS] = quant_attrs + bias_tensor_wrapper = self.define_tensor( + bias_node, + bias_tensor, + PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_STATIC, + nodes_to_wrappers, + is_input_tensor=False, + ) + + epsilon = node.args[3] + if isinstance(epsilon, torch.fx.Node): + epsilon = get_parameter(epsilon, self.edge_program) + epsilon = ( + epsilon + if isinstance(epsilon, float) + else torch.finfo(epsilon.dtype).eps + ) + + output_tensor = self.get_tensor(node, 
node) + output_tensor_wrapper = self.define_tensor( + node, + output_tensor, + PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE, + nodes_to_wrappers, + is_input_tensor=False, + ) + + rms_nrom_op = PyQnnWrapper.PyQnnOpWrapper( + node.name, + QNN_OP_PACKAGE_NAME_QTI_AISW, + OpRmsNorm.op_name, + ) + + rms_nrom_op.AddInputTensors( + [input_tensor_wrapper, weight_tensor_wrapper, bias_tensor_wrapper] + ) + rms_nrom_op.AddOutputTensors([output_tensor_wrapper]) + rms_nrom_op.AddScalarParam( + OpRmsNorm.param_epsilon, + PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_FLOAT_32, + {QCOM_DATA: np.float32(epsilon)}, + ) + rms_nrom_op.AddTensorParam( + OpRmsNorm.param_axes, + PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UINT_32, + len(axes_shape), + axes_shape, + np.array(axes, dtype=np.uint32), + True, + ) + + return rms_nrom_op diff --git a/backends/qualcomm/builders/op_softmax.py b/backends/qualcomm/builders/op_softmax.py index ae4c89bbb96..cda40aed458 100644 --- a/backends/qualcomm/builders/op_softmax.py +++ b/backends/qualcomm/builders/op_softmax.py @@ -17,7 +17,7 @@ @register_node_visitor class Softmax(NodeVisitor): - target = ["aten._softmax.default"] + target = ["aten._softmax.default", "aten._safe_softmax.default"] def __init__(self, *args) -> None: super().__init__(*args) diff --git a/backends/qualcomm/builders/qnn_constants.py b/backends/qualcomm/builders/qnn_constants.py index 4a87e5dbbb3..8ac702f2ad5 100644 --- a/backends/qualcomm/builders/qnn_constants.py +++ b/backends/qualcomm/builders/qnn_constants.py @@ -278,6 +278,13 @@ class OpResizeNearestNeighbor: param_half_pixel_centers: str = "half_pixel_centers" +@dataclass(init=False, frozen=True) +class OpRmsNorm: + op_name: str = "RmsNorm" + param_epsilon: str = "epsilon" + param_axes: str = "axes" + + @dataclass(init=False, frozen=True) class OpScatterNd: op_name: str = "ScatterNd" diff --git a/backends/qualcomm/partition/common_defs.py b/backends/qualcomm/partition/common_defs.py index 353169bc186..d68441c2f79 100644 --- a/backends/qualcomm/partition/common_defs.py +++ b/backends/qualcomm/partition/common_defs.py @@ -17,7 +17,11 @@ ] to_be_implemented_operator = [ - exir_ops.edge.aten.where.default, + exir_ops.edge.aten.any.dim, + exir_ops.edge.aten.eq.Scalar, + exir_ops.edge.aten.full_like.default, + exir_ops.edge.aten.logical_not.default, + exir_ops.edge.aten.where.self, ] allow_list_operator = [ diff --git a/backends/qualcomm/partition/qnn_partitioner.py b/backends/qualcomm/partition/qnn_partitioner.py index 73dbede8ff6..659bda517f0 100644 --- a/backends/qualcomm/partition/qnn_partitioner.py +++ b/backends/qualcomm/partition/qnn_partitioner.py @@ -44,16 +44,7 @@ def __init__( ): self.node_visitors = node_visitor.get_node_visitors(edge_program) - self.skip_node_op_builder_set = set() - if skip_node_op_set is not None: - self.skip_node_op_builder_set = set( - [ - self.node_visitors[val] - for val in skip_node_op_set - if val in self.node_visitors - ] - ) - + self.skip_node_op_set = skip_node_op_set self.skip_node_id_set = skip_node_id_set self.nodes_to_wrappers = defaultdict(dict) self.qnn_manager = PyQnnManager.QnnManager( @@ -75,14 +66,9 @@ def is_node_supported(self, _, node: torch.fx.Node) -> bool: if node.target in allow_list_operator: return True - if self.skip_node_id_set is not None and node.name in self.skip_node_id_set: - print(f"[QNN Partitioner Op Support]: {node.target.__name__} | Skipped") - return False - if ( - self.skip_node_op_builder_set is not None - and self.node_visitors[node.target.__name__] - in 
self.skip_node_op_builder_set + node.name in self.skip_node_id_set + or node.target.__name__ in self.skip_node_op_set ): print(f"[QNN Partitioner Op Support]: {node.target.__name__} | Skipped") return False @@ -124,8 +110,8 @@ def __init__( QnnBackend.__name__, self.compiler_specs_snapshot ) self.partition_tags: Dict[str, DelegationSpec] = {} - self.skip_node_id_set = skip_node_id_set - self.skip_node_op_set = skip_node_op_set + self.skip_node_id_set = set() if skip_node_id_set is None else skip_node_id_set + self.skip_node_op_set = set() if skip_node_op_set is None else skip_node_op_set def generate_partitions( self, edge_program: torch.export.ExportedProgram diff --git a/backends/qualcomm/passes/annotate_and_quant_scalar.py b/backends/qualcomm/passes/annotate_and_quant_scalar.py index 5f111ee9c8b..1db50694ece 100644 --- a/backends/qualcomm/passes/annotate_and_quant_scalar.py +++ b/backends/qualcomm/passes/annotate_and_quant_scalar.py @@ -14,7 +14,7 @@ from executorch.exir.passes import dead_code_elimination_pass from torch.fx.passes.utils.source_matcher_utils import get_source_partitions -from .utils import get_quant_attrs +from .utils import dq_ops, get_quant_attrs class AnnotateAndQuantScalar(ExportPass): @@ -78,6 +78,7 @@ def _annotate_scalar_node( float, torch.float32, torch.int32, + torch.int64, ]: return @@ -88,30 +89,43 @@ def _traverse_binary_node(self, graph_module: torch.fx.GraphModule): graph_module.graph, self.binary_op_sources ) src_partitions = list(itertools.chain(*src_partitions.values())) + processed = set() for src_partition in src_partitions: - output = src_partition.output_nodes[0] - if ( - output.meta.get(QCOM_QUANT_ATTRS) - and len(src_partition.input_nodes) == 1 - ): - dq_node = src_partition.input_nodes[0] - q_node = dq_node.args[0] - q_node_attrs = get_quant_attrs(graph_module, q_node) - - scalar_nodes = [n for n in output.args if n != dq_node] - if len(scalar_nodes) == 0: + # need post process here to identify partitioned nodes: + src_fn_dict = {} + for n in src_partition.nodes: + # e.g. 
+                #      meta["source_fn_stack"]: [('mul', <built-in function mul>)]
+                # we'll use the function in the last stack entry as the grouping key
+                node_list = src_fn_dict.setdefault(n.meta["source_fn_stack"][-1][1], [])
+                node_list.append(n)
+
+            for nodes in src_fn_dict.values():
+                output = [n for n in nodes if n in src_partition.output_nodes][0]
+                # if all args have been annotated, it shouldn't be a scalar operation
+                if all(arg.target in dq_ops for arg in output.args):
                     continue
 
-                scalar_node = scalar_nodes[0]
-                source_scalar_node = self._get_source_scalar_node(scalar_node)
-                # we'll abandon cast op here, since the constant scalar will
-                # be pre-loaded into QNN context binary
-                output.replace_input_with(scalar_node, source_scalar_node)
+                if output not in processed and QCOM_QUANT_ATTRS in output.meta:
+                    dq_node = [n for n in output.args if n.target in dq_ops][0]
+                    q_node = dq_node.args[0]
+                    q_node_attrs = get_quant_attrs(graph_module, q_node)
+
+                    scalar_nodes = [n for n in output.args if n != dq_node]
+                    if len(scalar_nodes) == 0:
+                        continue
+
+                    scalar_node = scalar_nodes[0]
+                    source_scalar_node = self._get_source_scalar_node(scalar_node)
+                    # we'll abandon cast op here, since the constant scalar will
+                    # be pre-loaded into QNN context binary
+                    output.replace_input_with(scalar_node, source_scalar_node)
 
-                scalar_quant_attrs = self._update_scalar_node_attrs(
-                    source_scalar_node, q_node_attrs
-                )
-                self._annotate_scalar_node(source_scalar_node, scalar_quant_attrs)
+                    scalar_quant_attrs = self._update_scalar_node_attrs(
+                        source_scalar_node, q_node_attrs
+                    )
+                    self._annotate_scalar_node(source_scalar_node, scalar_quant_attrs)
+                    processed.add(output)
 
     def call(self, graph_module: torch.fx.GraphModule):
         self._traverse_binary_node(graph_module)
diff --git a/backends/qualcomm/passes/i64_to_i32.py b/backends/qualcomm/passes/i64_to_i32.py
index 7814a3ff0d6..1d2171cc37a 100644
--- a/backends/qualcomm/passes/i64_to_i32.py
+++ b/backends/qualcomm/passes/i64_to_i32.py
@@ -5,7 +5,9 @@
 # LICENSE file in the root directory of this source tree.
import torch from executorch.backends.qualcomm.builders.utils import get_parameter, is_constant +from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass, PassResult +from torch._subclasses.fake_tensor import FakeTensor class I64toI32(ExportPass): @@ -16,6 +18,8 @@ class I64toI32(ExportPass): def __init__(self, edge_program: torch.export.ExportedProgram): super(I64toI32, self).__init__() self.edge_program = edge_program + # pyre-ignore[4] + self.copy_op = exir_ops.edge.aten._to_copy.default def _update_meta(self, node: torch.fx.node) -> None: meta_val = node.meta["val"] @@ -32,6 +36,10 @@ def _update_meta(self, node: torch.fx.node) -> None: if meta_val.dtype == torch.int64: node.meta["val"] = meta_val.to(torch.float) + # pyre-ignore[2] + def _is_tensor_of_dtype(self, node_val, dtype: torch.dtype) -> bool: + return isinstance(node_val, FakeTensor) and node_val.dtype == dtype + def _cast_to_int32(self, graph_module: torch.fx.GraphModule): for n in graph_module.graph.nodes: if is_constant(n, self.edge_program): @@ -39,6 +47,22 @@ def _cast_to_int32(self, graph_module: torch.fx.GraphModule): if param.dtype == torch.int64: # QNN does not support int64 self._update_meta(n) + elif n.op == "placeholder": + node_val = n.meta["val"] + if self._is_tensor_of_dtype(node_val, torch.int64): + with graph_module.graph.inserting_after(n): + args = (n,) + to_dst_node = graph_module.graph.create_node( + "call_function", + self.copy_op, + args, + {"dtype": torch.int32}, + ) + to_dst_node.meta["val"] = node_val.to(torch.int32) + + # Replace usage of the src dtype result with the dst dtype result. + n.replace_all_uses_with(to_dst_node) + to_dst_node.args = (n,) def call(self, graph_module: torch.fx.GraphModule): self._cast_to_int32(graph_module) diff --git a/backends/qualcomm/passes/recompose_pixel_shuffle.py b/backends/qualcomm/passes/recompose_pixel_shuffle.py deleted file mode 100644 index 9eec6bfa264..00000000000 --- a/backends/qualcomm/passes/recompose_pixel_shuffle.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright (c) Qualcomm Innovation Center, Inc. -# All rights reserved -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. -import torch -from executorch.exir.dialects._ops import ops as exir_ops -from executorch.exir.pass_base import ExportPass, PassResult -from torch.fx.passes.utils.source_matcher_utils import get_source_partitions - - -class RecomposePixelShuffle(ExportPass): - """ - Merge decomposed operators back to one super node. 
- """ - - def __init__(self): - super().__init__() - - def call(self, graph_module: torch.fx.GraphModule): - graph = graph_module.graph - # decomposed core aten ops - partitions = get_source_partitions(graph, [torch.nn.PixelShuffle]) - for _, src_partitions in partitions.items(): - for src_partition in src_partitions: - input_node = src_partition.input_nodes[0] - output_node = src_partition.output_nodes[0] - with graph.inserting_after(input_node): - h_in_shape = input_node.meta["val"].shape[2] - h_out_shape = output_node.meta["val"].shape[2] - upscale_factor = h_out_shape / h_in_shape - - pixel_shuffle_node = graph.create_node( - "call_function", - exir_ops.edge.aten.pixel_shuffle.default, - (input_node, int(upscale_factor)), - ) - users = output_node.users.copy() - for user in users: - user.replace_input_with(output_node, pixel_shuffle_node) - # copy metadata - pixel_shuffle_node.meta = output_node.meta - - graph.eliminate_dead_code() - graph_module.recompile() - return PassResult(graph_module, True) diff --git a/backends/qualcomm/passes/recompose_pixel_unshuffle.py b/backends/qualcomm/passes/recompose_pixel_unshuffle.py index a47f3d119a5..00d46639089 100644 --- a/backends/qualcomm/passes/recompose_pixel_unshuffle.py +++ b/backends/qualcomm/passes/recompose_pixel_unshuffle.py @@ -6,7 +6,6 @@ import torch from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass, PassResult -from torch.fx.passes.utils.source_matcher_utils import get_source_partitions class RecomposePixelUnshuffle(ExportPass): @@ -85,30 +84,6 @@ def call(self, graph_module: torch.fx.GraphModule): # copy metadata pixel_unshuffle_node.meta = node.meta - # decomposed core aten ops - if not self.quantization_capture: - partitions = get_source_partitions(graph, [torch.nn.PixelUnshuffle]) - for _, src_partitions in partitions.items(): - for src_partition in src_partitions: - input_node = src_partition.input_nodes[0] - output_node = src_partition.output_nodes[0] - with graph.inserting_after(input_node): - h_in_shape = input_node.meta["val"].shape[2] - h_out_shape = output_node.meta["val"].shape[2] - downscale_factor = h_in_shape / h_out_shape - - op = self.op - pixel_unshuffle_node = graph.create_node( - "call_function", - op, - (input_node, int(downscale_factor)), - ) - users = output_node.users.copy() - for user in users: - user.replace_input_with(output_node, pixel_unshuffle_node) - # copy metadata - pixel_unshuffle_node.meta = output_node.meta - graph.eliminate_dead_code() graph_module.recompile() return PassResult(graph_module, True) diff --git a/backends/qualcomm/passes/recompose_rms_norm.py b/backends/qualcomm/passes/recompose_rms_norm.py new file mode 100644 index 00000000000..b26de8bd794 --- /dev/null +++ b/backends/qualcomm/passes/recompose_rms_norm.py @@ -0,0 +1,76 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +import torch +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass, PassResult +from torch.fx.passes.utils.source_matcher_utils import get_source_partitions + +from .utils import dq_ops + + +class RecomposeRmsNorm(ExportPass): + """ + Merge decomposed operators back to one super node. 
+    """
+
+    def __init__(self):
+        super().__init__()
+
+    def _get_eps_node(self, nodes):
+        # eps: one of the inputs of the add node
+        add_node = [n for n in nodes if hasattr(n, "name") and "add" in n.name][0]
+        for a in add_node.args:
+            if isinstance(a, float) or a.op != "call_function":
+                return a
+
+    def _get_gamma_node(self, output_node):
+        # gamma: one of the inputs of the output node
+        for a in output_node.args:
+            if a.op != "call_function" or a.target in dq_ops:
+                return a
+
+    def call(self, graph_module: torch.fx.GraphModule):
+        graph = graph_module.graph
+        partitions = get_source_partitions(graph, [torch.nn.RMSNorm])
+        for _, src_partitions in partitions.items():
+            for src_partition in src_partitions:
+                input_len = len(src_partition.input_nodes)
+                if input_len == 1:
+                    input_node = src_partition.input_nodes[0]
+                elif input_len == 2:
+                    inp_0, inp_1 = src_partition.input_nodes
+                    input_node = inp_0 if len(inp_0.users) == 2 else inp_1
+                else:
+                    raise RuntimeError(
+                        f"Found an edge case of rms_norm partition {src_partition}, which has {input_len} inputs"
+                    )
+
+                output_node = src_partition.output_nodes[0]
+                eps_node = self._get_eps_node(src_partition.nodes)
+                gamma_node = self._get_gamma_node(output_node)
+
+                with graph.inserting_before(output_node):
+                    # args schema
+                    # (Tensor input, int[] normalized_shape, Tensor? weight=None, float? eps=None) -> Tensor
+                    rms_node = graph.create_node(
+                        "call_function",
+                        exir_ops.edge.aten.rms_norm.default,
+                        (
+                            input_node,
+                            list(gamma_node.meta["val"].shape),
+                            gamma_node,
+                            eps_node,
+                        ),
+                    )
+                    users = output_node.users.copy()
+                    for user in users:
+                        user.replace_input_with(output_node, rms_node)
+                    # copy metadata
+                    rms_node.meta = output_node.meta
+
+        graph.eliminate_dead_code()
+        graph_module.recompile()
+        return PassResult(graph_module, True)
diff --git a/backends/qualcomm/passes/replace_index_put_input.py b/backends/qualcomm/passes/replace_index_put_input.py
new file mode 100644
index 00000000000..1eb210cf67e
--- /dev/null
+++ b/backends/qualcomm/passes/replace_index_put_input.py
@@ -0,0 +1,54 @@
+# Copyright (c) Qualcomm Innovation Center, Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+import torch +from executorch.backends.qualcomm.utils.constants import QCOM_ENCODING, QCOM_QUANT_ATTRS +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass, PassResult + + +class ReplaceIndexPutInput(ExportPass): + """ + Index put input workaround for quantized module + """ + + dq_q_map = { + # per tensor + exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default: exir_ops.edge.quantized_decomposed.quantize_per_tensor.default, + exir_ops.edge.quantized_decomposed.dequantize_per_tensor.tensor: exir_ops.edge.quantized_decomposed.quantize_per_tensor.tensor, + # per channel + exir_ops.edge.quantized_decomposed.dequantize_per_channel.default: exir_ops.edge.quantized_decomposed.quantize_per_channel.default, + } + + def __init__(self, edge_program: torch.export.ExportedProgram): + super(ReplaceIndexPutInput, self).__init__() + self.edge_program = edge_program + + def call(self, graph_module: torch.fx.GraphModule): + graph = graph_module.graph + for node in graph.nodes: + if node.target == exir_ops.edge.aten.index_put.default: + if ( + copy_node := list(node.users)[0] + ) and copy_node.target == exir_ops.edge.aten.copy.default: + m_buffer_node = copy_node.args[0] + bad_frozen_node = node.args[0] + if QCOM_QUANT_ATTRS in bad_frozen_node.meta: + m_buffer_node.meta[QCOM_QUANT_ATTRS] = bad_frozen_node.meta[ + QCOM_QUANT_ATTRS + ] + m_buffer_node.meta[QCOM_QUANT_ATTRS][QCOM_ENCODING] = ( + self.dq_q_map[ + m_buffer_node.meta[QCOM_QUANT_ATTRS][QCOM_ENCODING] + ] + ) + with graph.inserting_after(bad_frozen_node): + node.replace_input_with(bad_frozen_node, m_buffer_node) + else: + continue + + graph.eliminate_dead_code() + graph_module.recompile() + return PassResult(graph_module, True) diff --git a/backends/qualcomm/quantizer/custom_annotation.py b/backends/qualcomm/quantizer/custom_annotation.py index b2c86e50d33..9cde50b9c70 100644 --- a/backends/qualcomm/quantizer/custom_annotation.py +++ b/backends/qualcomm/quantizer/custom_annotation.py @@ -91,15 +91,17 @@ def is_edge_condition(node: Node): def annotate_matmul_input1(node: Node, quantization_config: QuantizationConfig): if is_edge_condition(node): return - if node.target == torch.ops.aten.index_put_.default: + if node.target in [ + torch.ops.aten.index_put.default, + torch.ops.aten.index_put_.default, + ]: annotate_index_put(node, quantization_config) annotate_matmul_input1(node.args[0], quantization_config) elif node.target == torch.ops.aten.cat.default: annotate_cat(node, quantization_config) # Expect that the inputs of the cat op are select ops - for arg in node.args[0][1:]: - annotate_single_in_single_out(arg, quantization_config) - annotate_matmul_input1(node.args[0][0], quantization_config) + for arg in node.args[0]: + annotate_matmul_input1(arg, quantization_config) else: annotate_single_in_single_out(node, quantization_config) annotate_matmul_input1(node.args[0], quantization_config) diff --git a/backends/qualcomm/quantizer/quantizer.py b/backends/qualcomm/quantizer/quantizer.py index d51e016473f..e27edf939c8 100644 --- a/backends/qualcomm/quantizer/quantizer.py +++ b/backends/qualcomm/quantizer/quantizer.py @@ -116,7 +116,7 @@ def _update_per_channel_weight_quant_ops(self, ops: Set[OpOverload], enable: boo if enable: self.use_per_channel_weight_quant_ops.update(ops) else: - self.use_per_channel_weight_quant_ops.difference(ops) + self.use_per_channel_weight_quant_ops.difference_update(ops) def add_16bit_quant_ops(self, ops: Set[OpOverload]) -> None: for op in ops: diff --git 
a/backends/qualcomm/quantizer/utils.py b/backends/qualcomm/quantizer/utils.py index d31b4753a3d..d3ae1194acd 100644 --- a/backends/qualcomm/quantizer/utils.py +++ b/backends/qualcomm/quantizer/utils.py @@ -4,6 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. import numbers +import operator from dataclasses import dataclass from functools import partial from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple @@ -77,7 +78,7 @@ def _derive_bias_qparams_fn( def get_default_8bit_qnn_ptq_config( - act_symmetric: bool = False, act_observer=MinMaxObserver + act_symmetric: bool = False, act_observer=MovingAverageMinMaxObserver ) -> QuantizationConfig: extra_args: Dict[str, Any] = {"eps": 2**-12} @@ -96,7 +97,7 @@ def get_default_8bit_qnn_ptq_config( quant_max=torch.iinfo(torch.int8).max, qscheme=torch.per_tensor_symmetric, ch_axis=0, - observer_or_fake_quant_ctr=act_observer.with_args(**extra_args), + observer_or_fake_quant_ctr=MinMaxObserver.with_args(**extra_args), ) bias_quantization_spec = QuantizationSpec( @@ -104,7 +105,7 @@ def get_default_8bit_qnn_ptq_config( quant_min=torch.iinfo(torch.int32).min, quant_max=torch.iinfo(torch.int32).max, qscheme=torch.per_tensor_symmetric, - observer_or_fake_quant_ctr=act_observer.with_args(**extra_args), + observer_or_fake_quant_ctr=MinMaxObserver.with_args(**extra_args), ) quantization_config = QuantizationConfig( @@ -619,7 +620,13 @@ def annotate_upsample_nearest2d( annotate_single_in_single_out(node, quantization_config) -@register_annotator([torch.ops.aten.softmax.int, torch.ops.aten._softmax.default]) +@register_annotator( + [ + torch.ops.aten.softmax.int, + torch.ops.aten._softmax.default, + torch.ops.aten._safe_softmax.default, + ] +) def annotate_softmax(node: Node, quantization_config: QuantizationConfig) -> None: annotate_single_in_single_out(node, quantization_config) @@ -684,6 +691,31 @@ def annotate_squeeze(node: Node, quantization_config: QuantizationConfig) -> Non annotate_single_in_single_out(node, quantization_config) +@register_annotator([torch.ops.aten.rms_norm.default]) +def annotate_rms_norm(node: Node, quantization_config: QuantizationConfig) -> None: + act_node = node.args[0] + weight_node = node.args[2] + + if _is_annotated([node]): + return + + # TODO current only support 16a16w + _annotate_input_qspec_map( + node, + act_node, + quantization_config.input_activation, + ) + + _annotate_input_qspec_map( + node, + weight_node, + quantization_config.input_activation, + ) + nodes_to_mark_annotated = [node] + _annotate_output_qspec(node, quantization_config.output_activation) + _mark_nodes_as_annotated(nodes_to_mark_annotated) + + @register_annotator([torch.ops.aten.rsqrt.default]) def annotate_rsqrt(node: Node, quantization_config: QuantizationConfig) -> None: annotate_single_in_single_out(node, quantization_config) @@ -975,6 +1007,38 @@ def annotate_linear(node: Node, quantization_config: QuantizationConfig) -> None node.meta["source_fn_stack"] = [(node, torch.nn.Linear)] +@register_annotator([torch.ops.aten._native_batch_norm_legit_no_training.default]) +def annotate_batch_norm(node: Node, quantization_config: QuantizationConfig) -> None: + act, weight, bias = node.args[0:3] + if _is_annotated([node]): + return + + _annotate_input_qspec_map( + node, + act, + quantization_config.input_activation, + ) + # QNN requires uint8 instead of int8 in 'weight' config + _annotate_input_qspec_map( + node, + weight, + 
quantization_config.input_activation, + ) + _annotate_input_qspec_map( + node, + bias, + quantization_config.bias, + ) + _annotate_output_qspec(node, quantization_config.output_activation) + _mark_nodes_as_annotated([node, *node.args[0:3]]) + + +@register_annotator([operator.getitem]) +def annotate_getitem(node: Node, quantization_config: QuantizationConfig) -> None: + _annotate_output_qspec(node, quantization_config.output_activation) + _mark_nodes_as_annotated([node]) + + @register_annotator([torch.ops.aten.layer_norm.default]) def annotate_layer_norm(node: Node, quantization_config: QuantizationConfig) -> None: act_node = node.args[0] diff --git a/backends/qualcomm/runtime/QnnExecuTorch.h b/backends/qualcomm/runtime/QnnExecuTorch.h index 45525726ca7..dabd4cdde5f 100644 --- a/backends/qualcomm/runtime/QnnExecuTorch.h +++ b/backends/qualcomm/runtime/QnnExecuTorch.h @@ -44,7 +44,7 @@ struct CustomMemTensorInfo { size_t tensor_bytes; uint32_t* shape; uint32_t rank; - torch::executor::ScalarType dtype; + exec_aten::ScalarType dtype; }; /// Allocate specific tensors (usually graph inputs and outputs) on shared diff --git a/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp b/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp index 36512c4ff21..f5c9473411e 100644 --- a/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp +++ b/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp @@ -213,8 +213,10 @@ Error QnnExecuTorchBackend::execute( } ET_CHECK_OR_RETURN_ERROR( - qnn_manager->Execute(input_tensor_structs, output_tensor_structs) == - Error::Ok, + qnn_manager->Execute( + input_tensor_structs, + output_tensor_structs, + context.event_tracer()) == Error::Ok, Internal, "Fail to execute graph"); ET_CHECK_OR_RETURN_ERROR( diff --git a/backends/qualcomm/runtime/QnnExecuTorchBackend.h b/backends/qualcomm/runtime/QnnExecuTorchBackend.h index ed4d35068dc..fbcc7058894 100644 --- a/backends/qualcomm/runtime/QnnExecuTorchBackend.h +++ b/backends/qualcomm/runtime/QnnExecuTorchBackend.h @@ -14,7 +14,8 @@ namespace torch { namespace executor { -class QnnExecuTorchBackend final : public PyTorchBackendInterface { +class QnnExecuTorchBackend final + : public ::executorch::runtime::BackendInterface { public: ~QnnExecuTorchBackend(){}; diff --git a/backends/qualcomm/runtime/QnnManager.cpp b/backends/qualcomm/runtime/QnnManager.cpp index 3027c184d95..f4275f0ab3d 100644 --- a/backends/qualcomm/runtime/QnnManager.cpp +++ b/backends/qualcomm/runtime/QnnManager.cpp @@ -8,7 +8,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -56,9 +58,7 @@ QnnManager::QnnManager( "backend_type: %s", EnumNameQnnExecuTorchBackendType(backend_type)); QNN_EXECUTORCH_LOG_INFO("graph_name: %s", options_->graph_name()->c_str()); QNN_EXECUTORCH_LOG_INFO("library_path: %s", library_path.c_str()); - QNN_EXECUTORCH_LOG_INFO( - "tensor_dump_output_path: %s", - options_->tensor_dump_output_path()->c_str()); + QNN_EXECUTORCH_LOG_INFO("dump intermediate outputs: %s", IsTensorDump()); QNN_EXECUTORCH_LOG_INFO( "log_level: %s", EnumNameQnnExecuTorchLogLevel(options_->log_level())); QNN_EXECUTORCH_LOG_INFO( @@ -281,6 +281,8 @@ Error QnnManager::Init() { options_->backend_options()->backend_type()); backend_params_ptr_ = QnnBackendFactory().Create( qnn_loaded_backend_, logger_.get(), qnn_context_blob_, options_); + ET_CHECK_OR_RETURN_ERROR( + backend_params_ptr_ != nullptr, Internal, "Failed to load Qnn backend.") ET_CHECK_OR_RETURN_ERROR( backend_params_ptr_->qnn_backend_ptr_->Configure() == Error::Ok, Internal, @@ 
-363,7 +365,8 @@ Error QnnManager::AllocateTensor( Error QnnManager::Execute( const std::vector& input_tensor_structs, - std::vector& output_tensor_structs) { + std::vector& output_tensor_structs, + EventTracer* event_tracer) { Qnn_ErrorHandle_t error = QNN_SUCCESS; error = backend_params_ptr_->qnn_graph_ptr_->GraphExecute( @@ -374,30 +377,27 @@ Error QnnManager::Execute( "qnn_graph_execute failed. Error %d", QNN_GET_ERROR_CODE(error)); return Error::Internal; } - if (IsTensorDump()) { // TODO: Need to handle the graph which is partitioned. // Maybe we could use graph name. - std::string dir = options_->tensor_dump_output_path()->str() + "/Result/"; - CreateDirectory(dir); - QNN_EXECUTORCH_LOG_INFO("Dump tensor to the path: %s", dir.c_str()); for (std::size_t out_idx = 0; out_idx < output_tensor_structs.size(); ++out_idx) { const Qnn_Tensor_t& output_tensor = output_tensor_structs[out_idx]; - - std::string output_path = - dir + QNN_VER_PTR(output_tensor)->name + "_tensor.raw"; - - std::ofstream fout(output_path, std::ios::binary); - if (fout.fail()) { - QNN_EXECUTORCH_LOG_ERROR( - "Dump tensor name: %s Failed.", QNN_VER_PTR(output_tensor)->name); - return Error::Internal; - } - - fout.write( - static_cast(QNN_VER_PTR(output_tensor)->clientBuf.data), - QNN_VER_PTR(output_tensor)->clientBuf.dataSize); + std::vector sizes( + QNN_VER_PTR(output_tensor)->dimensions, + QNN_VER_PTR(output_tensor)->dimensions + + QNN_VER_PTR(output_tensor)->rank); + + auto dump_tensor = executorch::extension::from_blob( + QNN_VER_PTR(output_tensor)->clientBuf.data, + sizes, + qnn_dtype_to_scalar_type_[QNN_VER_PTR(output_tensor)->dataType]); + + torch::executor::event_tracer_log_output_delegate( + event_tracer, + QNN_VER_PTR(output_tensor)->name, + /*delegate_debug_id=*/static_cast(-1), + *dump_tensor); } } diff --git a/backends/qualcomm/runtime/QnnManager.h b/backends/qualcomm/runtime/QnnManager.h index 5190f6768b7..3d1cc3863aa 100644 --- a/backends/qualcomm/runtime/QnnManager.h +++ b/backends/qualcomm/runtime/QnnManager.h @@ -37,7 +37,8 @@ class QnnManager { Error Execute( const std::vector& input_tensor_structs, - std::vector& output_tensor_structs); + std::vector& output_tensor_structs, + EventTracer* event_tracer); Error ProfileExecuteData(EventTracer* event_tracer); @@ -52,7 +53,7 @@ class QnnManager { } bool IsTensorDump() { - return options_->tensor_dump_output_path()->size() > 0; + return options_->dump_intermediate_outputs(); } bool IsNodeSupportedByBackend( diff --git a/backends/qualcomm/runtime/SharedBuffer.cpp b/backends/qualcomm/runtime/SharedBuffer.cpp index 3fa62d09cdb..2b2a729835c 100644 --- a/backends/qualcomm/runtime/SharedBuffer.cpp +++ b/backends/qualcomm/runtime/SharedBuffer.cpp @@ -25,7 +25,7 @@ std::size_t std::hash::operator()( hash_val ^= info.shape[i]; } hash_val ^= std::hash()(info.rank); - hash_val ^= std::hash()(info.dtype); + hash_val ^= std::hash()(info.dtype); return hash_val; } diff --git a/backends/qualcomm/runtime/backends/QnnBackendCommon.cpp b/backends/qualcomm/runtime/backends/QnnBackendCommon.cpp index 3e286c07b02..c67f9b52f5d 100644 --- a/backends/qualcomm/runtime/backends/QnnBackendCommon.cpp +++ b/backends/qualcomm/runtime/backends/QnnBackendCommon.cpp @@ -53,6 +53,85 @@ Error QnnBackend::Configure() { } return Error::Ok; } + +Error QnnBackend::VerifyQNNSDKVersion( + const QnnExecuTorchBackendType backend_id) { + const QnnInterface& qnn_interface = implementation_.GetQnnInterface(); + + Qnn_ApiVersion_t qnn_version = {QNN_VERSION_INIT}; + Qnn_ErrorHandle_t error = + 
qnn_interface.qnn_backend_get_api_version(&qnn_version); + if (error != QNN_SUCCESS) { + QNN_EXECUTORCH_LOG_ERROR("Failed to get Qnn API version."); + return Error::Internal; + } + + Qnn_ApiVersion_t expected_version = {QNN_VERSION_INIT}; + expected_version.coreApiVersion.major = QNN_API_VERSION_MAJOR; + expected_version.coreApiVersion.minor = QNN_API_VERSION_MINOR; + expected_version.coreApiVersion.patch = QNN_API_VERSION_PATCH; + expected_version.backendApiVersion = GetExpectedBackendVersion(); + const char* backend_type = EnumNameQnnExecuTorchBackendType(backend_id); + + Error status = VersionChecker( + qnn_version.coreApiVersion, expected_version.coreApiVersion, "Qnn API"); + if (status == Error::Ok) { + status = VersionChecker( + qnn_version.backendApiVersion, + expected_version.backendApiVersion, + backend_type); + } + + return status; +} + +Error QnnBackend::VersionChecker( + const Qnn_Version_t& qnn_version, + const Qnn_Version_t& expected, + const std::string& prefix) { + if (qnn_version.major != expected.major) { + QNN_EXECUTORCH_LOG_ERROR( + "%s version %u.%u.%u is not supported. " + "The minimum supported version is %u.%u.%u. Please make " + "sure you have the correct backend library version.", + prefix.c_str(), + qnn_version.major, + qnn_version.minor, + qnn_version.patch, + expected.major, + expected.minor, + expected.patch); + return Error::Internal; + } + if (qnn_version.major == QNN_API_VERSION_MAJOR && + qnn_version.minor < expected.minor) { + QNN_EXECUTORCH_LOG_WARN( + "%s version %u.%u.%u is mismatched. " + "The minimum supported version is %u.%u.%u. Please make " + "sure you have the correct backend library version.", + prefix.c_str(), + qnn_version.major, + qnn_version.minor, + qnn_version.patch, + expected.major, + expected.minor, + expected.patch); + } + if ((qnn_version.major == QNN_API_VERSION_MAJOR && + qnn_version.minor > expected.minor)) { + QNN_EXECUTORCH_LOG_WARN( + "%s version %u.%u.%u is used. 
" + "The version is tested against %u.%u.%u.", + prefix.c_str(), + qnn_version.major, + qnn_version.minor, + qnn_version.patch, + expected.major, + expected.minor, + expected.patch); + } + return Error::Ok; +} } // namespace qnn } // namespace executor } // namespace torch diff --git a/backends/qualcomm/runtime/backends/QnnBackendCommon.h b/backends/qualcomm/runtime/backends/QnnBackendCommon.h index e6ea0adff8b..de007898e5d 100644 --- a/backends/qualcomm/runtime/backends/QnnBackendCommon.h +++ b/backends/qualcomm/runtime/backends/QnnBackendCommon.h @@ -13,8 +13,10 @@ #include +#include "HTP/QnnHtpCommon.h" #include "QnnBackend.h" #include "QnnCommon.h" +#include "QnnTypes.h" namespace torch { namespace executor { namespace qnn { @@ -43,7 +45,10 @@ class QnnBackend { return handle_; } + Error VerifyQNNSDKVersion(const QnnExecuTorchBackendType backend_id); + protected: + virtual Qnn_Version_t GetExpectedBackendVersion() const = 0; virtual Error MakeConfig(std::vector& config) { return Error::Ok; }; @@ -52,6 +57,10 @@ class QnnBackend { Qnn_BackendHandle_t handle_; const QnnImplementation& implementation_; QnnLogger* logger_; + Error VersionChecker( + const Qnn_Version_t& qnn_version, + const Qnn_Version_t& expected, + const std::string& prefix); }; } // namespace qnn } // namespace executor diff --git a/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp b/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp index acb95524682..9fb292613a3 100644 --- a/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp +++ b/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp @@ -16,6 +16,7 @@ std::unique_ptr QnnBackendFactory::Create( const QnnExecuTorchContextBinary& qnn_context_blob, const QnnExecuTorchOptions* options) { auto backend_params = std::make_unique(); + switch (options->backend_options()->backend_type()) { case QnnExecuTorchBackendType::kHtpBackend: { auto htp_options = options->backend_options()->htp_options(); @@ -51,6 +52,7 @@ std::unique_ptr QnnBackendFactory::Create( } backend_params->qnn_backend_ptr_ = std::make_unique(implementation, logger); + backend_params->qnn_device_ptr_ = std::make_unique( implementation, logger, options->soc_info(), htp_options); @@ -72,7 +74,6 @@ std::unique_ptr QnnBackendFactory::Create( backend_params->qnn_mem_manager_ptr_ = std::make_unique( implementation, backend_params->qnn_context_ptr_.get()); backend_params->backend_init_state_ = BackendInitializeState::INITIALIZED; - return backend_params; } break; case QnnExecuTorchBackendType::kGpuBackend: case QnnExecuTorchBackendType::kDspBackend: @@ -81,7 +82,11 @@ std::unique_ptr QnnBackendFactory::Create( return nullptr; } - // should not reach here + if (backend_params->qnn_backend_ptr_->VerifyQNNSDKVersion( + options->backend_options()->backend_type()) == Error::Ok) { + return backend_params; + } + return nullptr; } } // namespace qnn diff --git a/backends/qualcomm/runtime/backends/QnnProfiler.cpp b/backends/qualcomm/runtime/backends/QnnProfiler.cpp index fa5829d23b8..ae336a800b6 100644 --- a/backends/qualcomm/runtime/backends/QnnProfiler.cpp +++ b/backends/qualcomm/runtime/backends/QnnProfiler.cpp @@ -7,7 +7,6 @@ */ #include -#include namespace torch { namespace executor { diff --git a/backends/qualcomm/runtime/backends/htpbackend/HtpBackend.h b/backends/qualcomm/runtime/backends/htpbackend/HtpBackend.h index d4b14178a43..d00bd50cdc3 100644 --- a/backends/qualcomm/runtime/backends/htpbackend/HtpBackend.h +++ b/backends/qualcomm/runtime/backends/htpbackend/HtpBackend.h @@ -8,7 +8,9 @@ #pragma 
once #include +#include "HTP/QnnHtpCommon.h" #include "HTP/QnnHtpProfile.h" +#include "QnnTypes.h" namespace torch { namespace executor { namespace qnn { @@ -24,6 +26,14 @@ class HtpBackend : public QnnBackend { event_type == QNN_HTP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_ACCEL_TIME_CYCLE); } + Qnn_Version_t GetExpectedBackendVersion() const override { + Qnn_Version_t backend_version; + backend_version.major = QNN_HTP_API_VERSION_MAJOR; + backend_version.minor = QNN_HTP_API_VERSION_MINOR; + backend_version.patch = QNN_HTP_API_VERSION_PATCH; + return backend_version; + } + protected: Error MakeConfig(std::vector& config) override { return Error::Ok; diff --git a/backends/qualcomm/runtime/targets.bzl b/backends/qualcomm/runtime/targets.bzl index 77449e95e2a..61650fab268 100644 --- a/backends/qualcomm/runtime/targets.bzl +++ b/backends/qualcomm/runtime/targets.bzl @@ -63,5 +63,6 @@ def define_common_targets(): "//executorch/backends/qualcomm/aot/wrappers:wrappers", "//executorch/runtime/backend:interface", "//executorch/runtime/core:core", + "//executorch/extension/tensor:tensor", ], ) diff --git a/backends/qualcomm/scripts/build.sh b/backends/qualcomm/scripts/build.sh index aafd6252e79..4cb2f50bbd2 100755 --- a/backends/qualcomm/scripts/build.sh +++ b/backends/qualcomm/scripts/build.sh @@ -4,6 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. set -e +set -o xtrace if [[ -z ${QNN_SDK_ROOT} ]]; then echo "Please export QNN_SDK_ROOT=/path/to/qnn_sdk" @@ -70,7 +71,7 @@ if [ "$BUILD_AARCH64" = true ]; then rm -rf $BUILD_ROOT && mkdir $BUILD_ROOT else # Force rebuild flatccrt for the correct platform - cd $BUILD_ROOT/sdk && make clean + cd $BUILD_ROOT/devtools && make clean fi cd $BUILD_ROOT @@ -78,8 +79,9 @@ if [ "$BUILD_AARCH64" = true ]; then -DCMAKE_INSTALL_PREFIX=$BUILD_ROOT \ -DCMAKE_BUILD_TYPE=$BUILD_TYPE \ -DEXECUTORCH_BUILD_QNN=ON \ - -DEXECUTORCH_BUILD_SDK=ON \ + -DEXECUTORCH_BUILD_DEVTOOLS=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ -DQNN_SDK_ROOT=$QNN_SDK_ROOT \ -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_ROOT/build/cmake/android.toolchain.cmake \ @@ -112,7 +114,7 @@ if [ "$BUILD_X86_64" = true ]; then rm -rf $BUILD_ROOT && mkdir $BUILD_ROOT else # Force rebuild flatccrt for the correct platform - cd $BUILD_ROOT/sdk && make clean + cd $BUILD_ROOT/devtools && make clean fi cd $BUILD_ROOT @@ -121,8 +123,9 @@ if [ "$BUILD_X86_64" = true ]; then -DCMAKE_INSTALL_PREFIX=$BUILD_ROOT \ -DQNN_SDK_ROOT=${QNN_SDK_ROOT} \ -DEXECUTORCH_BUILD_QNN=ON \ - -DEXECUTORCH_BUILD_SDK=ON \ + -DEXECUTORCH_BUILD_DEVTOOLS=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ -DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \ -S $PRJ_ROOT \ diff --git a/backends/qualcomm/serialization/qnn_compile_spec_schema.py b/backends/qualcomm/serialization/qnn_compile_spec_schema.py index 338f61997ea..8471aad982d 100644 --- a/backends/qualcomm/serialization/qnn_compile_spec_schema.py +++ b/backends/qualcomm/serialization/qnn_compile_spec_schema.py @@ -129,7 +129,7 @@ class QnnExecuTorchOptions: library_path: str = "" log_level: QnnExecuTorchLogLevel = QnnExecuTorchLogLevel.kLogOff online_prepare: bool = False - tensor_dump_output_path: str = "" + dump_intermediate_outputs: bool = False profile_level: QnnExecuTorchProfileLevel = QnnExecuTorchProfileLevel.kProfileOff shared_buffer: bool = False 
is_from_context_binary: bool = False diff --git a/backends/qualcomm/serialization/schema.fbs b/backends/qualcomm/serialization/schema.fbs index 4288c83b130..4e7fdb56e89 100644 --- a/backends/qualcomm/serialization/schema.fbs +++ b/backends/qualcomm/serialization/schema.fbs @@ -164,11 +164,9 @@ table QnnExecuTorchOptions { /// Check if on-device graph construction. Default is false. online_prepare:bool; - /// Tensor dump output path. If a path is given, Delegate would write - /// outputs of each OP there. - /// In ALL cases, we don't recommend to set this option. - /// This option exist just for debugging some accuracy issues. - tensor_dump_output_path:string; + /// If tensor dump is enabled, all intermediate tensors output will be dumped. + /// This option exists for debugging accuracy issues. Default is off. + dump_intermediate_outputs:bool; /// Profiling level of the delegate and the backend. Default is off. profile_level:QnnExecuTorchProfileLevel; diff --git a/backends/qualcomm/tests/models.py b/backends/qualcomm/tests/models.py index 319cc6092cd..e448a219284 100644 --- a/backends/qualcomm/tests/models.py +++ b/backends/qualcomm/tests/models.py @@ -55,6 +55,16 @@ def forward(self, x): return self.avgPool(x) +class BatchNorm(torch.nn.Module): + def __init__(self, n_features): + super().__init__() + self.native_batchnorm = torch.nn.BatchNorm2d(n_features) + self.eval() + + def forward(self, x): + return self.native_batchnorm(x) + + class Bmm(torch.nn.Module): def __init__(self): super().__init__() @@ -734,6 +744,16 @@ def forward(self, x): ) +class RmsNorm(torch.nn.Module): + def __init__(self): + super().__init__() + self.eps = 1e-5 + self.rms = torch.nn.RMSNorm([4], 1e-5) + + def forward(self, x): + return self.rms(x) + + class Rsqrt(torch.nn.Module): def __init__(self): super().__init__() diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py index dd704c35c08..d022ac96c48 100644 --- a/backends/qualcomm/tests/test_qnn_delegate.py +++ b/backends/qualcomm/tests/test_qnn_delegate.py @@ -16,6 +16,7 @@ from executorch.backends.qualcomm.tests.utils import ( generate_context_binary, QnnPartitioner, + QnnQuantizer, QuantDtype, TestQNN, to_backend, @@ -33,6 +34,7 @@ from_context_binary, generate_htp_compiler_spec, generate_qnn_executorch_compiler_spec, + skip_annotation, ) from executorch.examples.qualcomm.utils import setup_common_args_and_variables @@ -50,8 +52,8 @@ from executorch.examples.models.mobilenet_v3 import MV3Model from executorch.examples.models.torchvision_vit.model import TorchVisionViTModel from executorch.examples.models.wav2letter import Wav2LetterModel +from executorch.exir import to_edge from executorch.exir.backend.backend_api import disable_validation -from executorch.exir.program._program import EdgeCompileConfig, ExirExportedProgram class TestQNNFloatingPointOperator(TestQNN): @@ -66,7 +68,7 @@ def setUp(self): debug=False, saver=False, online_prepare=TestQNN.online_prepare, - tensor_dump_output_path="", + dump_intermediate_outputs=TestQNN.dump_intermediate_outputs, profile=TestQNN.enable_profile, shared_buffer=TestQNN.shared_buffer, ) @@ -81,6 +83,11 @@ def test_qnn_backend_avg_pool2d(self): sample_input = (torch.randn(1, 3, 2, 2),) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_batch_norm(self): + module = BatchNorm(32) # noqa: F405 + sample_input = (torch.randn([4, 32, 16, 16]),) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_bmm(self): module = Bmm() # 
noqa: F405 torch.manual_seed(8) @@ -291,7 +298,6 @@ def test_qnn_backend_layer_norm(self): sample_input = (torch.randn(196, 768),) self.lower_module_and_test_output(module, sample_input) - @unittest.skip("only works on QNN 2.17") def test_qnn_backend_leaky_relu(self): test_comb = [ { @@ -334,14 +340,12 @@ def test_qnn_backend_mean_dim(self): with self.subTest(i=i): self.lower_module_and_test_output(module, sample_input) - @unittest.skip("it will hang in runtime") + @unittest.skip("failed to lower in QNN 2.26") def test_qnn_backend_mha(self): module = MultiheadAttention() # noqa: F405 sample_input = (torch.randn(1, 197, 96),) self.lower_module_and_test_output(module, sample_input) - # fp16 pad op might hit corner case in runtime - @unittest.expectedFailure def test_qnn_backend_pad(self): module = Pad() # noqa: F405 sample_input = (torch.randn([1, 8, 128]),) @@ -362,7 +366,6 @@ def test_qnn_backend_pow_tensor_scalar(self): sample_input = (torch.rand([2, 4, 3, 3]),) self.lower_module_and_test_output(module, sample_input) - @unittest.skip("only works on QNN 2.17") def test_qnn_backend_prelu(self): test_comb = [ { @@ -393,6 +396,11 @@ def test_qnn_backend_reshape(self): sample_input = (torch.randn([3, 4]),) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_rms_norm(self): + module = RmsNorm() # noqa: F405 + sample_input = (torch.abs(torch.randn([1, 1, 1, 4])),) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_rsqrt(self): module = Rsqrt() # noqa: F405 sample_input = (torch.abs(torch.randn([3, 4])),) @@ -482,7 +490,7 @@ def setUp(self): debug=False, saver=False, online_prepare=TestQNN.online_prepare, - tensor_dump_output_path="", + dump_intermediate_outputs=TestQNN.dump_intermediate_outputs, profile=TestQNN.enable_profile, shared_buffer=TestQNN.shared_buffer, ) @@ -596,7 +604,7 @@ def setUp(self): debug=False, saver=False, online_prepare=TestQNN.online_prepare, - tensor_dump_output_path="", + dump_intermediate_outputs=TestQNN.dump_intermediate_outputs, profile=TestQNN.enable_profile, shared_buffer=TestQNN.shared_buffer, ) @@ -621,6 +629,7 @@ def test_qnn_backend_16a4w_linear(self): ) self.lower_module_and_test_output(module, sample_input) + @unittest.skip("segfault happens in QNN 2.26") def test_qnn_backend_16a4w_per_channel_linear(self): module = Linear(use_bias=False) # noqa: F405 sample_input = (torch.randn([3, 4]),) @@ -655,6 +664,12 @@ def test_qnn_backend_avg_pool2d(self): module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_batch_norm(self): + module = BatchNorm(32) # noqa: F405 + sample_input = (torch.randn([4, 32, 16, 16]),) + module = self.get_qdq_module(module, sample_input) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_bmm(self): module = Bmm() # noqa: F405 torch.manual_seed(8) @@ -662,13 +677,6 @@ def test_qnn_backend_bmm(self): module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) - @unittest.skip("not applicable") - def test_qnn_backend_cast(self): - module = Cast() # noqa: F405 - sample_input = (10 * torch.rand((9, 4, 5, 3)),) - module = self.get_qdq_module(module, sample_input) - self.lower_module_and_test_output(module, sample_input) - def test_qnn_backend_cat(self): modules = [Cat2(), Cat3(), Cat4()] # noqa: F405 sample_input = (torch.randn(1, 1, 2, 2), torch.randn(1, 1, 4, 2)) @@ -1000,6 +1008,14 @@ def test_qnn_backend_reshape(self): module = 
self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_rms_norm(self): + module = RmsNorm() # noqa: F405 + sample_input = (torch.abs(torch.randn([1, 1, 1, 4])),) + module = self.get_qdq_module( + module, sample_input, quant_dtype=QuantDtype.use_16a4w + ) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_rsqrt(self): module = Rsqrt() # noqa: F405 sample_input = (torch.abs(torch.randn([3, 4])),) @@ -1105,7 +1121,7 @@ def setUp(self): debug=False, saver=False, online_prepare=TestQNN.online_prepare, - tensor_dump_output_path="", + dump_intermediate_outputs=TestQNN.dump_intermediate_outputs, profile=TestQNN.enable_profile, shared_buffer=TestQNN.shared_buffer, ) @@ -1271,6 +1287,22 @@ def setUp(self): saver=False, ) + def test_qnn_backend_dump_intermediate_outputs(self): + backend_options = generate_htp_compiler_spec(use_fp16=True) + TestQNN.compiler_specs = generate_qnn_executorch_compiler_spec( + soc_model=self.arch_table[TestQNN.model], + backend_options=backend_options, + dump_intermediate_outputs=True, + ) + module = Relu() # noqa: F405 + sample_input = (torch.randn([2, 5, 1, 3]),) + self.lower_module_and_test_output( + module, + sample_input, + expected_partitions=1, + expected_intermediate_events=3, + ) + def test_qnn_backend_skip_node_id(self): module = SimpleModel() # noqa: F405 sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28)) @@ -1329,16 +1361,10 @@ def test_qnn_backend_multi_contexts_composite(self): lowered_method=to_backend, ) sample_input = module.get_random_input() - edge_prog = ExirExportedProgram( + edge_prog = to_edge( torch.export.export(module, sample_input), - after_to_edge_passes=False, - ).to_edge( - EdgeCompileConfig( - _check_ir_validity=False, - _skip_dim_order=True, # TODO(T182928844): Delegate dim order op to backend. 
- ) ) - canonicalize_program(edge_prog.exported_program) + canonicalize_program(edge_prog.exported_program()) exec_prog = edge_prog.to_executorch() self.verify_output(module.get_reference_module(), sample_input, exec_prog) @@ -1388,6 +1414,7 @@ def test_qnn_backend_online_prepare(self): sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28)) self.lower_module_and_test_output(module, sample_input) + @unittest.skip("segfault happens in recent torch.export.export") def test_qnn_backend_context_direct(self): with tempfile.TemporaryDirectory() as tmp_dir: module = ContextBinaryExample() # noqa: F405 @@ -1431,7 +1458,24 @@ def setUp(self): saver=False, ) - def test_qnn_backend_skip_node_id(self): + def test_qnn_backend_dump_intermediate_outputs(self): + backend_options = generate_htp_compiler_spec(use_fp16=False) + TestQNN.compiler_specs = generate_qnn_executorch_compiler_spec( + soc_model=self.arch_table[TestQNN.model], + backend_options=backend_options, + dump_intermediate_outputs=True, + ) + module = Relu() # noqa: F405 + sample_input = (torch.randn([2, 5, 1, 3]),) + module = self.get_qdq_module(module, sample_input) + self.lower_module_and_test_output( + module, + sample_input, + expected_partitions=1, + expected_intermediate_events=5, + ) + + def test_qnn_backend_skip_node_id_partitioner(self): module = SimpleModel() # noqa: F405 sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28)) module = self.get_qdq_module(module, sample_input) @@ -1442,7 +1486,43 @@ def test_qnn_backend_skip_node_id(self): skip_node_id_set={"aten_add_tensor", "aten_mean_dim"}, ) - def test_qnn_backend_skip_node_op(self): + def test_qnn_backend_skip_node_id_quantizer(self): + module = SimpleModel() # noqa: F405 + sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28)) + + # define partitioner + backend_options = generate_htp_compiler_spec( + use_fp16=False, + ) + compiler_specs = generate_qnn_executorch_compiler_spec( + soc_model=self.arch_table[TestQNN.model], + backend_options=backend_options, + ) + partitioner = QnnPartitioner(compiler_specs) + # define quantizer + quantizer = QnnQuantizer() + + # define calibration method + def calibrator(gm): + gm(*sample_input) + + # get partially lowererd graph module + graph_module, exported_progs = skip_annotation( + nn_module=module, + quantizer=quantizer, + partitioner=partitioner, + sample_input=sample_input, + calibration_cb=calibrator, + fp_node_id_set={"conv2d"}, + ) + self.assertEqual(len(exported_progs), 1) + # lower all graph again, the skipped operators will be left in CPU + exec_prog = to_edge( + torch.export.export(graph_module, sample_input), + ).to_executorch() + self.verify_output(module, sample_input, exec_prog) + + def test_qnn_backend_skip_node_op_partitioner(self): module = SimpleModel() # noqa: F405 sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28)) module = self.get_qdq_module(module, sample_input) @@ -1453,6 +1533,79 @@ def test_qnn_backend_skip_node_op(self): skip_node_op_set={"aten.add.Tensor"}, ) + def test_qnn_backend_skip_node_op_quantizer(self): + module = SimpleModel() # noqa: F405 + sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28)) + + # define partitioner + backend_options = generate_htp_compiler_spec( + use_fp16=False, + ) + compiler_specs = generate_qnn_executorch_compiler_spec( + soc_model=self.arch_table[TestQNN.model], + backend_options=backend_options, + ) + partitioner = QnnPartitioner(compiler_specs) + # define quantizer + quantizer = QnnQuantizer() + + # 
define calibration method + def calibrator(gm): + gm(*sample_input) + + # get partially lowererd graph module + graph_module, exported_progs = skip_annotation( + nn_module=module, + quantizer=quantizer, + partitioner=partitioner, + sample_input=sample_input, + calibration_cb=calibrator, + fp_node_op_set={torch.ops.aten.add.Tensor}, + ) + self.assertEqual(len(exported_progs), 2) + # lower all graph again, the skipped operators will be left in CPU + exec_prog = exec_prog = to_edge( + torch.export.export(graph_module, sample_input), + ).to_executorch() + self.verify_output(module, sample_input, exec_prog) + + def test_qnn_backend_graph_level_mixed_precision(self): + module = SimpleModel() # noqa: F405 + sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28)) + + # define partitioner + backend_options = generate_htp_compiler_spec( + use_fp16=False, + ) + compiler_specs = generate_qnn_executorch_compiler_spec( + soc_model=self.arch_table[TestQNN.model], + backend_options=backend_options, + ) + partitioner = QnnPartitioner(compiler_specs) + # define quantizer + quantizer = QnnQuantizer() + + # define calibration method + def calibrator(gm): + gm(*sample_input) + + # get partially lowererd graph module + graph_module, exported_progs = skip_annotation( + nn_module=module, + quantizer=quantizer, + partitioner=partitioner, + sample_input=sample_input, + calibration_cb=calibrator, + fp_node_id_set={"add", "mean"}, + fallback_to_cpu=False, + ) + self.assertEqual(len(exported_progs), 5) + # lower all graph again, the skipped operators will be delegated with fp16 + exec_prog = to_edge( + torch.export.export(graph_module, sample_input), + ).to_executorch() + self.verify_output(module, sample_input, exec_prog) + def test_qnn_backend_multi_contexts(self): module = SimpleModel() # noqa: F405 sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28)) @@ -1493,16 +1646,10 @@ def test_qnn_backend_multi_contexts_composite(self): quantize_method=self.get_qdq_module, ) sample_input = module.get_random_input() - edge_prog = ExirExportedProgram( + edge_prog = to_edge( torch.export.export(module, sample_input), - after_to_edge_passes=False, - ).to_edge( - EdgeCompileConfig( - _check_ir_validity=False, - _skip_dim_order=True, # TODO(T182928844): Delegate dim order op to backend. 
- ) ) - canonicalize_program(edge_prog.exported_program) + canonicalize_program(edge_prog.exported_program()) exec_prog = edge_prog.to_executorch() self.verify_output(module.get_reference_module(), sample_input, exec_prog) @@ -1555,6 +1702,7 @@ def test_qnn_backend_online_prepare(self): module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) + @unittest.skip("segfault happens in recent torch.export.export") def test_qnn_backend_context_direct(self): with tempfile.TemporaryDirectory() as tmp_dir: module = ContextBinaryExample() # noqa: F405 @@ -1668,6 +1816,46 @@ def test_gMLP(self): self.assertGreaterEqual(msg["top_1"], 60) self.assertGreaterEqual(msg["top_5"], 90) + def test_regnet(self): + if not self.required_envs([self.image_dataset]): + self.skipTest("missing required envs") + + weights = ["regnet_y_400mf", "regnet_x_400mf"] + cmds = [ + "python", + f"{self.executorch_root}/examples/qualcomm/oss_scripts/regnet.py", + "--dataset", + self.image_dataset, + "--artifact", + self.artifact_dir, + "--build_folder", + self.build_folder, + "--device", + self.device, + "--model", + self.model, + "--ip", + self.ip, + "--port", + str(self.port), + ] + if self.host: + cmds.extend(["--host", self.host]) + + for weight in weights: + p = subprocess.Popen( + cmds + ["--weights", weight], stdout=subprocess.DEVNULL + ) + with Listener((self.ip, self.port)) as listener: + conn = listener.accept() + p.communicate() + msg = json.loads(conn.recv()) + if "Error" in msg: + self.fail(msg["Error"]) + else: + self.assertGreaterEqual(msg["top_1"], 60) + self.assertGreaterEqual(msg["top_5"], 85) + def test_ssd300_vgg16(self): if not self.required_envs([self.pretrained_weight, self.oss_repo]): self.skipTest("missing required envs") @@ -1996,7 +2184,61 @@ def test_llama3_8b(self): self.fail(msg["Error"]) else: model_out = msg["result"] - self.assertTrue(model_out.startswith(prompt)) + expected_result = ( + "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" + + prompt + + "<|eot_id|><|start_header_id|>assistant<|end_header_id|>" + ) + self.assertTrue(model_out.startswith(expected_result)) + + def test_stable_diffusion(self): + if not self.required_envs(): + self.skipTest("missing required envs") + + prompt = "a photo of an astronaut riding a horse on mars" + cmds = [ + "python", + f"{self.executorch_root}/examples/qualcomm/qaihub_scripts/stable_diffusion/qaihub_stable_diffusion.py", + "--artifact", + self.artifact_dir, + "--build_folder", + self.build_folder, + "--device", + self.device, + "--model", + self.model, + "--text_encoder_bin", + f"{self.artifact_dir}/text_encoder.serialized.bin", + "--unet_bin", + f"{self.artifact_dir}/unet.serialized.bin", + "--vae_bin", + f"{self.artifact_dir}/vae.serialized.bin", + "--vocab_json", + f"{self.artifact_dir}/vocab.json", + "--num_time_steps", + "20", + "--ip", + self.ip, + "--port", + str(self.port), + "--prompt", + f"{prompt}", + "--fix_latents", + ] + if self.host: + cmds.extend(["--host", self.host]) + + p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) + with Listener((self.ip, self.port)) as listener: + conn = listener.accept() + p.communicate() + msg = json.loads(conn.recv()) + if "Error" in msg: + self.fail(msg["Error"]) + else: + # For the default settings and prompt, the expected results will be {PSNR: 23.258, SSIM: 0.852} + self.assertGreaterEqual(msg["PSNR"], 20) + self.assertGreaterEqual(msg["SSIM"], 0.8) class TestExampleScript(TestQNN): @@ -2324,6 +2566,7 @@ def test_stories_single_llama(self): 
model_out = msg["result"][0] self.assertTrue(model_out.startswith(golden_start_with)) + @unittest.skip("dynamic shape inputs appear in recent torch.export.export") def test_mobilebert(self): if not self.required_envs([self.pretrained_weight]): self.skipTest("missing required envs") @@ -2364,13 +2607,8 @@ def test_mobilebert(self): for k, v in cpu.items(): self.assertLessEqual(abs(v[0] - htp[k][0]), 2) - @unittest.skip("will be enabled after TODOs got resolved") + @unittest.skip("eagar mode fake quant works well, need further investigation") def test_ptq_mobilebert(self): - # TODO: 2 approaches to resolve accuracy issue - # 1. fallback embedding layers: - # - skip annotation in quantizer (need PR to provide helper funciton) - # - skip operators in partitioner (use existent "skip_node_op_set") - # 2. investigate different quantization configurations / mechanisms if not self.required_envs([self.pretrained_weight]): self.skipTest("missing required envs") @@ -2387,6 +2625,8 @@ def test_ptq_mobilebert(self): self.model, "--pretrained_weight", self.pretrained_weight, + "--ptq", + "16a16w", "--ip", self.ip, "--port", @@ -2513,6 +2753,7 @@ def setup_environment(): TestQNN.oss_repo = args.oss_repo TestQNN.shared_buffer = args.shared_buffer TestQNN.enable_x86_64 = args.enable_x86_64 + TestQNN.dump_intermediate_outputs = args.dump_intermediate_outputs return sys.argv[:1] + ns_args diff --git a/backends/qualcomm/tests/utils.py b/backends/qualcomm/tests/utils.py index 5fd6d5ad196..7209b0a2678 100644 --- a/backends/qualcomm/tests/utils.py +++ b/backends/qualcomm/tests/utils.py @@ -27,6 +27,7 @@ QcomChipset, ) from executorch.backends.qualcomm.utils.utils import capture_program +from executorch.devtools import generate_etrecord, Inspector from executorch.examples.qualcomm.utils import ( generate_inputs, make_output_dir, @@ -39,9 +40,7 @@ from executorch.exir.lowered_backend_module import LoweredBackendModule from executorch.exir.pass_base import ExportPass from executorch.exir.passes.memory_planning_pass import MemoryPlanningPass -from executorch.exir.program._program import ExecutorchProgram -from executorch.sdk import generate_etrecord -from executorch.sdk.inspector import Inspector +from executorch.exir.program import ExecutorchProgram, ExecutorchProgramManager from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e @@ -181,18 +180,21 @@ def _save_model_and_expected_output( return input_list, ref_outputs, pte_fname - def verify_output( + def verify_output( # noqa: C901 self, module: torch.nn.Module, sample_inputs: Tuple[torch.Tensor], executorch_prog: ExecutorchProgram | LoweredBackendModule, etrecord_path: str = "etrecord.bin", expected_profile_events: int = -1, + expected_intermediate_events: int = -1, ): with tempfile.TemporaryDirectory() as tmp_dir: buffer = ( executorch_prog.buffer - if isinstance(executorch_prog, ExecutorchProgram) + if isinstance( + executorch_prog, (ExecutorchProgram, ExecutorchProgramManager) + ) else executorch_prog.buffer() ) ( @@ -209,6 +211,7 @@ def verify_output( output_dir = f"{tmp_dir}/outputs" outputs = [] etdump_path = f"{tmp_dir}/etdump.etdp" + debug_output_path = f"{tmp_dir}/debug_output.bin" def post_process(): for i, f in enumerate(sorted(os.listdir(output_dir))): @@ -223,6 +226,16 @@ def validate_profile(): len(inspector.to_dataframe().index) == expected_profile_events ) + def validate_intermediate_tensor(): + inspector = Inspector( + etdump_path=etdump_path, debug_buffer_path=debug_output_path + ) + for event_block in inspector.event_blocks: + 
if event_block.name == "Execute": + self.assertTrue( + len(event_block.events) == expected_intermediate_events + ) + if self.enable_x86_64: generate_inputs(tmp_dir, "input_list.txt", [sample_inputs], input_list) make_output_dir(output_dir) @@ -275,6 +288,9 @@ def validate_profile(): # Verify the etdump if expected_profile_events != -1: validate_profile() + + if expected_intermediate_events != -1: + validate_intermediate_tensor() else: adb = SimpleADB( qnn_sdk=os.getenv("QNN_SDK_ROOT"), @@ -285,6 +301,9 @@ def validate_profile(): host_id=self.host, soc_model=self.model, error_only=self.error_only, + dump_intermediate_outputs=( + True if expected_intermediate_events != -1 else False + ), ) adb.push(inputs=[sample_inputs], input_list=input_list) adb.execute() @@ -294,12 +313,20 @@ def validate_profile(): if expected_profile_events != -1: adb.pull_etdump(etdump_path, callback=validate_profile) + if expected_intermediate_events != -1: + adb.pull_debug_output( + etdump_path, + debug_output_path, + callback=validate_intermediate_tensor, + ) + def lower_module_and_test_output( self, module: torch.nn.Module, sample_inputs: Tuple[torch.Tensor], expected_partitions: int = 1, expected_profile_events: int = -1, + expected_intermediate_events: int = -1, assert_output_equal: bool = True, skip_node_id_set: set = None, skip_node_op_set: set = None, @@ -323,7 +350,6 @@ def lower_module_and_test_output( # Therefore, won't want to pre-allocate # by memory manager in runtime. memory_planning_pass=MemoryPlanningPass( - memory_planning_algo="greedy", alloc_graph_input=not self.shared_buffer, alloc_graph_output=not self.shared_buffer, ), @@ -344,11 +370,19 @@ def lower_module_and_test_output( etrecord_path = "etrecord.bin" if self.enable_profile: generate_etrecord(etrecord_path, edge_copy, exec_prog) - # Check numerics - if assert_output_equal or expected_profile_events != -1: + if ( + assert_output_equal + or expected_profile_events != -1 + or expected_intermediate_events != -1 + ): self.verify_output( - module, sample_inputs, exec_prog, etrecord_path, expected_profile_events + module, + sample_inputs, + exec_prog, + etrecord_path, + expected_profile_events, + expected_intermediate_events, ) def get_qdq_module( diff --git a/backends/qualcomm/utils/utils.py b/backends/qualcomm/utils/utils.py index 6dc0c4c3c8d..a0c0abf7295 100644 --- a/backends/qualcomm/utils/utils.py +++ b/backends/qualcomm/utils/utils.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
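The verify_output changes above validate intermediate tensor dumps by pairing the ETDump pulled from the device with the debug output buffer and feeding both to the devtools Inspector. A minimal sketch of that inspection flow outside the test harness follows; the artifact paths and the printed summary are illustrative assumptions, not part of this patch.

from executorch.devtools import Inspector

# Artifacts produced by a runner built with event tracing and intermediate
# output dumping enabled; the test harness pulls these via SimpleADB.
etdump_path = "outputs/etdump.etdp"  # hypothetical path
debug_output_path = "outputs/debug_output.bin"  # hypothetical path

inspector = Inspector(
    etdump_path=etdump_path,
    debug_buffer_path=debug_output_path,
)

# Mirrors validate_intermediate_tensor(): count the events recorded while
# the lowered graph executed on device.
for event_block in inspector.event_blocks:
    if event_block.name == "Execute":
        print(f"captured {len(event_block.events)} intermediate events")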
+import operator +import warnings from collections import OrderedDict from typing import Callable, Dict, List, Tuple @@ -38,7 +40,11 @@ from executorch.backends.qualcomm.passes.recompose_pixel_unshuffle import ( RecomposePixelUnshuffle, ) +from executorch.backends.qualcomm.passes.recompose_rms_norm import RecomposeRmsNorm from executorch.backends.qualcomm.passes.remove_redundancy import RemoveRedundancy +from executorch.backends.qualcomm.passes.replace_index_put_input import ( + ReplaceIndexPutInput, +) from executorch.backends.qualcomm.serialization.qnn_compile_spec_schema import ( _soc_info_table, QcomChipset, ) @@ -56,6 +62,7 @@ convert_to_option, ) from executorch.backends.qualcomm.utils.constants import QCOM_QNN_COMPILE_SPEC + from executorch.exir import ExirExportedProgram from executorch.exir.backend.compile_spec_schema import CompileSpec from executorch.exir.lowered_backend_module import LoweredBackendModule @@ -63,9 +70,74 @@ from torch._decomp import core_aten_decompositions as torch_core_aten_decompositions from torch.export.exported_program import ExportedProgram from torch.fx import passes +from torch.fx.passes.operator_support import OperatorSupportBase from torch.library import Library + +class _AnnotationSkipper(OperatorSupportBase): + """ + Class used to partition out unwanted graph nodes, + e.g. - nodes to be excluded from quantization annotation + - nodes that have been grouped together as a submodule + + Attributes + ---------- + fp_node_id_set : set + a set of node names to be left in fp precision + fp_node_op_set : set + a set of node targets (aten dialect) to be left in fp precision + skip_annotated_submodule : bool + flag indicating whether annotated submodules should be skipped + + Methods + ------- + should_delegate(n: torch.fx.Node) + identify residual nodes that have not been lowered with fixed precision + should_skip(n: torch.fx.Node) + identify whether a node should be kept out of fixed precision + is_node_supported(_, node: torch.fx.Node) + overridden method for graph partitioning + """ + + def __init__( + self, + fp_node_id_set: set = None, + fp_node_op_set: set = None, + skip_annotated_submodule: bool = False, + ): + self.fp_node_id_set = fp_node_id_set + self.fp_node_op_set = fp_node_op_set + self.skip_annotated_submodule = skip_annotated_submodule + + def should_delegate(self, n: torch.fx.Node): + return n.op == "call_function" and n.target != operator.getitem + + def should_skip(self, n: torch.fx.Node): + return n.name in self.fp_node_id_set or n.target in self.fp_node_op_set + + def is_node_supported(self, _, node: torch.fx.Node) -> bool: + if self.skip_annotated_submodule: + if node.op == "get_attr": + return all(self.should_delegate(user) for user in node.users) + return self.should_delegate(node) + + if any( + [ + node.op in ("placeholder", "output"), + self.should_skip(node), + # check if parameters belong to a fallback operator + ( + node.op == "get_attr" + and all(self.should_skip(user) for user in node.users) + ), + ] + ): + print(f"[QNN Quantizer Annotation]: {node.name} | Skipped") + return False + + return True + + def qnn_capture_config(): return exir.CaptureConfig(enable_aot=True) @@ -184,8 +256,10 @@ def get_decomp_table() -> Dict[torch._ops.OperatorBase, Callable]: # The below super ops are supported by QNN remove_decompositions = [ torch.ops.aten.pixel_shuffle.default, + torch.ops.aten.pixel_unshuffle.default, torch.ops.aten.hardsigmoid.default, torch.ops.aten.hardswish.default, + torch.ops.aten._safe_softmax.default, ] for key in remove_decompositions: @@ 
-201,6 +275,7 @@ def _transform(edge_program: ExportedProgram) -> None: graph_module = edge_program.graph_module RemoveRedundancy()(graph_module) RecomposePixelUnshuffle()(graph_module) + RecomposeRmsNorm()(graph_module) ConvertToLinear()(graph_module) ConvertPReLU(edge_program)(graph_module) ConvertBmmToMatmul()(graph_module) @@ -211,6 +286,7 @@ def _transform(edge_program: ExportedProgram) -> None: AnnotateDecomposed(edge_program)(graph_module) FoldQDQ()(graph_module) LayoutTransform(edge_program)(graph_module) + ReplaceIndexPutInput(edge_program)(graph_module) # Since QDQ nodes are stripped, update graph signature again to validate program edge_program._graph_signature = _get_updated_graph_signature( @@ -238,6 +314,285 @@ def capture_program( return edge_ep +def _partition_graph_into_submodules(gm, subgm_tag, subgm_cb, ptn): + from torch.fx.passes.utils.fuser_utils import ( + erase_nodes, + fuse_as_graphmodule, + insert_subgm, + legalize_graph, + topo_sort, + ) + + partitions = ptn.propose_partitions() + # insert meta for each partition group + for i, partition in enumerate(partitions): + for node in partition.nodes: + node.meta[subgm_tag] = i + + for i in range(len(partitions)): + # find nodes with same group id in current graph + node_list = [ + node for node in gm.graph.nodes if node.meta.get(subgm_tag, "") == i + ] + # fuse group nodes into submodule + sorted_nodes = topo_sort(node_list) + submodule_name = f"{subgm_tag}_{i}" + subgm, orig_inputs, orig_outputs = fuse_as_graphmodule( + gm, sorted_nodes, submodule_name + ) + # insert submodule & trim group nodes + gm = insert_subgm( + gm, + subgm_cb(subgm, submodule_name), + orig_inputs, + orig_outputs, + ) + erase_nodes(gm, sorted_nodes) + legalize_graph(gm) + + gm.recompile() + return gm + + +def _canonicalize_graph_with_lowered_module(gm, subgm_tag, ptn): + from executorch.exir.backend.backend_api import to_backend + + # return lowered program for user to debug + exported_progs = [] + # partition each submodule which went through convert_pt2e + for node in gm.graph.nodes: + if node.op == "call_module" and subgm_tag in node.name: + # obtain sample inputs through meta + subgm_input = [ + torch.ones(arg.meta["val"].shape, dtype=arg.meta["val"].dtype) + for arg in node.args + ] + # program meets QNN backend requirement + sub_prog = capture_program(gm.get_submodule(node.name), tuple(subgm_input)) + # start lowering with given partitioner + exported_progs.append(to_backend(sub_prog.exported_program, ptn)) + # replace submodule with lowered module + gm.set_submodule( + node.name, + exported_progs[-1].graph_module, + ) + # if node has multiple outputs, getitem nodes will be generated by default + if all(n.target != operator.getitem for n in node.users): + with gm.graph.inserting_after(node): + getitem_node = gm.graph.call_function( + operator.getitem, + (node, 0), + ) + getitem_node.meta = node.meta + node.replace_all_uses_with( + replace_with=getitem_node, + delete_user_cb=lambda user: user.target != operator.getitem, + ) + + gm.recompile() + return gm, exported_progs + + +def skip_annotation( + nn_module: torch.nn.Module, + quantizer, + partitioner, + sample_input: Tuple[torch.Tensor, ...], + calibration_cb: Callable[[torch.fx.GraphModule], None], + fp_node_id_set: set = None, + fp_node_op_set: set = None, + fallback_to_cpu: bool = True, +): + r""" + Exclude specific operators from quantizer annotation. + Skipped operators stay on CPU by default; set 'fallback_to_cpu' + to False to try delegating them with FP16 precision. 
+ + e.g.: consider following graph: + bias_1 weight_1 input_1 bias_2 weight_2 input_2 + | (placeholder) | | (placeholder) | + \ | / \ | / + \ | / \ | / + \ | / \ | / + conv2d_1 conv2d_2 + (torch.ops.aten.conv2d.default) + \ / + \ / + \_______ _______/ + add_1 + (torch.ops.aten.add.default) + | + output + + If user wants to skip convolution op by names with + 'skip_node_id_set' = {"conv2d_1"} + "bias_1 / weight_1 / input_1 / input_2 / conv2d_1" + will be partitioned out and not annotated / lowered with QNN. + + [Generated graph] + bias_1 weight_1 input_1 input_2 + | (placeholder) | | + \ | / | + \ | / | + \ | / | + conv2d_1 | + \ / + \ / + \ / + lowered_module_1 + (QNN fixed precision) + | + output + + If user wants to skip convolution op by target with + 'skip_node_op_set' = {torch.ops.aten.conv2d.default} + "bias_1 / weight_1 / input_1 / conv2d_1, + bias_2 / weight_2 / input_2 / conv2d_2" + will be partitioned out and not annotated / lowered with QNN. + + [Generated graph] + bias_1 weight_1 input_1 bias_2 weight_2 input_2 + | (placeholder) | | (placeholder) | + \ | / \ | / + \ | / \ | / + \ | / \ | / + conv2d_1 conv2d_2 + (torch.ops.aten.conv2d.default) + \ / + \ / + \__ __/ + lowered_module_1 + (QNN fixed precision) + | + output + + If user wants to delegate the skipped conv2d from above graph + with 'fallback_to_cpu' = False: + + [Generated graph] + input_1 input_2 + (placeholder) (placeholder) + | | + \ / + lowered_module_2 + (QNN fp16 precision) + | + | + lowered_module_1 + (QNN fixed precision) + | + output + + Args: + nn_module (torch.nn.Module): The module to be lowered. + quantizer (QnnQuantizer): Instance of QnnQuantizer. + partitioner (QnnPartitioner): Instance of QnnPartitioner. + sample_input ((torch.Tensor, ...)): Sample input tensors for graph exporting. + calibration_cb (callable): Callback function for user-defined calibration. + fp_node_id_set ({str, ...}): Set of operator names to be left in fp precision. + fp_node_op_set ({torch.ops.aten.xxx, ...}): Set of operator targets to be left in fp precision. + fallback_to_cpu (bool): Whether to lower skipped nodes to fp16 or not. + + Returns: + exported_programs: List of programs lowered to QnnBackend (quantized graphs only). 
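Building on the signature and docstring above, a usage sketch of skip_annotation is given below. The toy module, calibration callback, and chipset are illustrative, and the import paths for QnnQuantizer, QnnPartitioner, and the HTP spec helpers are assumed from the surrounding Qualcomm utilities rather than shown in this patch.

import torch

from executorch.backends.qualcomm.partition.qnn_partitioner import QnnPartitioner
from executorch.backends.qualcomm.quantizer.quantizer import QnnQuantizer
from executorch.backends.qualcomm.serialization.qnn_compile_spec_schema import (
    QcomChipset,
)
from executorch.backends.qualcomm.utils.utils import (
    generate_htp_compiler_spec,
    generate_qnn_executorch_compiler_spec,
    skip_annotation,
)

# Toy module and sample input (illustrative only).
model = torch.nn.Sequential(torch.nn.Conv2d(3, 8, 3), torch.nn.ReLU()).eval()
sample_input = (torch.randn(1, 3, 32, 32),)

backend_options = generate_htp_compiler_spec(use_fp16=False)
compiler_specs = generate_qnn_executorch_compiler_spec(
    soc_model=QcomChipset.SM8650,
    backend_options=backend_options,
)


def calibrate(prepared_gm):
    # User-defined calibration: run representative inputs through the
    # prepared (fake-quant) graph module.
    prepared_gm(*sample_input)


graph_module, exported_progs = skip_annotation(
    nn_module=model,
    quantizer=QnnQuantizer(),
    partitioner=QnnPartitioner(compiler_specs),
    sample_input=sample_input,
    calibration_cb=calibrate,
    fp_node_op_set={torch.ops.aten.conv2d.default},
    fallback_to_cpu=False,  # try to delegate the skipped conv with FP16
)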
+ """ + from executorch.backends.qualcomm.serialization.qnn_compile_spec_schema import ( + QnnExecuTorchHtpPrecision, + ) + from executorch.backends.qualcomm.serialization.qnn_compile_spec_serialize import ( + convert_to_option, + ) + from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e + from torch.fx.passes.infra.partitioner import CapabilityBasedPartitioner + + def prepare_subgm(subgm, subgm_name): + # prepare current submodule for quantization annotation + subgm_prepared = prepare_pt2e(subgm, quantizer) + # overwrite this attribute, otherwise the name would be set to "GraphModule" + # and we could not tell the prepared submodules apart + subgm_prepared.__class__.__name__ = subgm_name + return subgm_prepared + + fp_node_id_set = fp_node_id_set if fp_node_id_set is not None else set() + fp_node_op_set = fp_node_op_set if fp_node_op_set is not None else set() + graph_module = torch.export.export(nn_module, sample_input).module() + # define node support type + capability_partitioner = CapabilityBasedPartitioner( + graph_module, + _AnnotationSkipper(fp_node_id_set, fp_node_op_set), + allows_single_node_partition=True, + ) + subgm_tag = "annotated_group" + graph_module = _partition_graph_into_submodules( + gm=graph_module, + subgm_tag=subgm_tag, + subgm_cb=prepare_subgm, + ptn=capability_partitioner, + ) + # perform calibration + calibration_cb(graph_module) + # convert submodules which went through prepare_pt2e + for node in graph_module.graph.nodes: + if node.op == "call_module": + graph_module.set_submodule( + node.name, convert_pt2e(graph_module.get_submodule(node.name)) + ) + # canonicalize graph for lowering again + graph_module, exported_progs = _canonicalize_graph_with_lowered_module( + gm=graph_module, + subgm_tag=subgm_tag, + ptn=partitioner, + ) + + if not fallback_to_cpu: + try: + from executorch.exir.backend.partitioner import DelegationSpec + + # change HTP compiler spec for hardware to enable fp16 + qnn_option = generate_qnn_executorch_option( + partitioner.compiler_specs_snapshot + ) + compile_option = convert_to_option(qnn_option) + htp_options = compile_option.backend_options.htp_options + htp_options.precision = QnnExecuTorchHtpPrecision.kHtpFp16 + partitioner.delegation_spec = DelegationSpec( + "QnnBackend", + [ + CompileSpec( + QCOM_QNN_COMPILE_SPEC, convert_to_flatbuffer(compile_option) + ) + ], + ) + except: + print( + "Failed to change HTP compiler spec with 'use_fp16' as True;" + " skipped operators will fall back to CPU." + ) + return graph_module, exported_progs + + # try lowering skipped operators into fp16 + capability_partitioner = CapabilityBasedPartitioner( + graph_module, + _AnnotationSkipper(skip_annotated_submodule=True), + allows_single_node_partition=True, + ) + subgm_tag = "skipped_group" + graph_module = _partition_graph_into_submodules( + gm=graph_module, + subgm_tag=subgm_tag, + subgm_cb=lambda subgm, _: subgm, + ptn=capability_partitioner, + ) + graph_module, exported_progs_fp = _canonicalize_graph_with_lowered_module( + gm=graph_module, + subgm_tag=subgm_tag, + ptn=partitioner, + ) + exported_progs.extend(exported_progs_fp) + + return graph_module, exported_progs + + def from_context_binary( ctx_path: str, op_name: str, soc_model: QcomChipset = QcomChipset.SM8650 ): @@ -380,7 +735,7 @@ def generate_qnn_executorch_compiler_spec( debug: bool = False, saver: bool = False, online_prepare: bool = False, - tensor_dump_output_path: str = "", + dump_intermediate_outputs: bool = False, profile: bool = False, shared_buffer: bool = False, 
is_from_context_binary: bool = False, @@ -402,10 +757,8 @@ def generate_qnn_executorch_compiler_spec( saver: Instead of compiling the model, run QNN Saver. Please check documents of Qualcomm AI Engine Direct SDK. This feature is usually for debugging purpose. - tensor_dump_output_path: If a path is given, Delegate would write - outputs of each OP there in runtime. In ALL cases, - we don't recommend to set this option. This option exist just - for debugging some accuracy issues. + dump_intermediate_outputs: If enabled, all intermediate tensor outputs will be dumped. + This option exists for debugging accuracy issues. profile: Enable profile the performance of per operator. Note that for now only support kProfileDetailed to profile the performance of each operator with cycle unit. @@ -423,6 +776,13 @@ def generate_qnn_executorch_compiler_spec( if soc_model not in _supported_soc_models: raise ValueError(f"unknown SoC model for QNN: {soc_model}") + if profile and dump_intermediate_outputs: + warnings.warn( + "It is not recommended to turn on both profiling and dump_intermediate_outputs at the same time" + ", because dump_intermediate_outputs will cause a performance drop.", + stacklevel=1, + ) + qnn_executorch_options = QnnExecuTorchOptions( _soc_info_table[soc_model], backend_options ) @@ -433,12 +793,11 @@ def generate_qnn_executorch_compiler_spec( else QnnExecuTorchLogLevel.kLogLevelWarn ) + qnn_executorch_options.dump_intermediate_outputs = dump_intermediate_outputs + if saver: qnn_executorch_options.library_path = "libQnnSaver.so" - if len(tensor_dump_output_path.strip()) != 0: - qnn_executorch_options.tensor_dump_output_path = tensor_dump_output_path - if profile: qnn_executorch_options.profile_level = ( QnnExecuTorchProfileLevel.kProfileDetailed diff --git a/backends/transforms/TARGETS b/backends/transforms/TARGETS index d461eb49788..df50e45f099 100644 --- a/backends/transforms/TARGETS +++ b/backends/transforms/TARGETS @@ -88,6 +88,20 @@ runtime.python_library( ], ) +runtime.python_library( + name = "view_copy_to_squeeze_unsqueeze", + srcs = ["view_copy_to_squeeze_unsqueeze.py"], + visibility = [ + "//executorch/backends/...", + ], + deps = [ + ":utils", + "//caffe2:torch", + "//executorch/exir:pass_base", + "//executorch/exir/dialects:lib", + ], +) + runtime.python_library( name = "fuse_view_copy", srcs = ["fuse_view_copy.py"], diff --git a/backends/transforms/addmm_mm_to_linear.py b/backends/transforms/addmm_mm_to_linear.py index 7855de617b7..358cbb7ac14 100644 --- a/backends/transforms/addmm_mm_to_linear.py +++ b/backends/transforms/addmm_mm_to_linear.py @@ -130,7 +130,7 @@ def replace_addmm_mm_with_linear(graph: torch.fx.Graph) -> torch.fx.Graph: "call_function", ops.aten.linear.default, args ) node.replace_all_uses_with(linear_node) - output_val = linear_node.target( + output_val = linear_node.target( # pyre-fixme[29] args[0].meta["val"], args[1].meta["val"], args[2].meta["val"] ) else: @@ -147,7 +147,7 @@ def replace_addmm_mm_with_linear(graph: torch.fx.Graph) -> torch.fx.Graph: "call_function", ops.aten.linear.default, args ) node.replace_all_uses_with(linear_node) - output_val = linear_node.target( + output_val = linear_node.target( # pyre-fixme[29] args[0].meta["val"], args[1].meta["val"] ) linear_node.meta = node.meta diff --git a/backends/transforms/decompose_sdpa.py b/backends/transforms/decompose_sdpa.py index 6dbbf564f56..329dab96df2 100644 --- a/backends/transforms/decompose_sdpa.py +++ b/backends/transforms/decompose_sdpa.py @@ -34,7 +34,7 @@ def call( # refer 
to pytorch/test/test_decomp.py decomposed_module = make_fx( node.target, - decomposition_table=get_decompositions( + decomposition_table=get_decompositions( # pyre-fixme[6] [ torch.ops.aten._scaled_dot_product_flash_attention_for_cpu.default, ] diff --git a/backends/transforms/view_copy_to_squeeze_unsqueeze.py b/backends/transforms/view_copy_to_squeeze_unsqueeze.py new file mode 100644 index 00000000000..094ec6a3340 --- /dev/null +++ b/backends/transforms/view_copy_to_squeeze_unsqueeze.py @@ -0,0 +1,128 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-strict + +from typing import List, Optional, Union + +import torch + +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass, PassResult + + +class ViewCopyToSqueezeUnsqueezePass(ExportPass): + """ + Replaces view_copy nodes with squeeze_copy.dims nodes if the view node reduces dims of size 1. + Replaces view_copy nodes with unsqueeze_copy.default nodes if the view node adds a dim of size 1. + """ + + def __init__(self) -> None: + super().__init__() + self.view_copy_op: torch._ops.OpOverload = exir_ops.edge.aten.view_copy.default + self.squeeze_op: torch._ops.OpOverload = exir_ops.edge.aten.squeeze_copy.dims + self.unsqueeze_op: torch._ops.OpOverload = ( + exir_ops.edge.aten.unsqueeze_copy.default + ) + + def is_node_target( + self, node: torch.fx.Node, target: torch._ops.OperatorBase + ) -> bool: + return node.op == "call_function" and node.target == target + + def find_squeeze_dims( + self, + input_shape: List[int], + view_shape: List[int], + ) -> Optional[List[int]]: + # view_shape should be a subset of input_shape + if len(input_shape) <= len(view_shape): + return None + + # check that all dims are equal except the removed dims + i = 0 + j = 0 + idx = [] + while i < len(input_shape): + if input_shape[i] != view_shape[j]: + if input_shape[i] == 1: + idx.append(i) + j -= 1 + # continue to check remaining dims are equal + else: + return None + i += 1 + j += 1 + return idx + + def find_unsqueeze_dim( + self, + input_shape: List[int], + view_shape: List[int], + ) -> Optional[int]: + # unsqueeze should increase the length of input_shape by 1 + if len(view_shape) - len(input_shape) != 1: + return None + + # check that all dims are equal except the added dim + i = 0 + j = 0 + idx = -1 + while j < len(view_shape): + if input_shape[i] != view_shape[j]: + if view_shape[j] == 1: + idx = j + i -= 1 + # continue to check remaining dims are equal + else: + return None + i += 1 + j += 1 + return idx + + def replace_view_copy_node( + self, + graph_module: torch.fx.GraphModule, + view_node: torch.fx.Node, + op: torch._ops.OpOverload, + arg: Union[List[int], int], + ) -> None: + with graph_module.graph.inserting_before(view_node): + new_node = graph_module.graph.create_node( + "call_function", + op, + (view_node.args[0], arg), + ) + new_node.meta = view_node.meta + view_node.replace_all_uses_with(new_node) + graph_module.graph.erase_node(view_node) + + def call(self, graph_module: torch.fx.GraphModule) -> PassResult: + modified = False + for node in graph_module.graph.nodes: + if self.is_node_target(node, self.view_copy_op): + input_node = node.args[0] + input_shape = input_node.meta["val"].shape + view_shape = node.args[1] + squeeze_dims = self.find_squeeze_dims(input_shape, view_shape) + if squeeze_dims: + 
self.replace_view_copy_node( + graph_module, node, self.squeeze_op, squeeze_dims + ) + modified = True + continue + unsqueeze_dim = self.find_unsqueeze_dim(input_shape, view_shape) + if unsqueeze_dim: + self.replace_view_copy_node( + graph_module, node, self.unsqueeze_op, unsqueeze_dim + ) + modified = True + continue + + if modified: + graph_module.recompile() + graph_module = super().call(graph_module).graph_module + return PassResult(graph_module, modified) diff --git a/backends/vulkan/cmake/ShaderLibrary.cmake b/backends/vulkan/cmake/ShaderLibrary.cmake index 49dc27056a0..b44736d20dd 100644 --- a/backends/vulkan/cmake/ShaderLibrary.cmake +++ b/backends/vulkan/cmake/ShaderLibrary.cmake @@ -50,8 +50,8 @@ function(gen_vulkan_shader_lib_cpp shaders_path) execute_process( COMMAND "${PYTHON_EXECUTABLE}" - ${EXECUTORCH_ROOT}/backends/vulkan/runtime/gen_vulkan_spv.py - --glsl-path ${shaders_path} --output-path ${VULKAN_SHADERGEN_OUT_PATH} + ${EXECUTORCH_ROOT}/backends/vulkan/runtime/gen_vulkan_spv.py --glsl-path + ${shaders_path} --output-path ${VULKAN_SHADERGEN_OUT_PATH} --glslc-path=${GLSLC_PATH} --tmp-dir-path=${VULKAN_SHADERGEN_OUT_PATH} --env ${VULKAN_GEN_ARG_ENV} RESULT_VARIABLE error_code diff --git a/backends/vulkan/docs/android_demo.md b/backends/vulkan/docs/android_demo.md index aaff7a7a727..8570859ed34 100644 --- a/backends/vulkan/docs/android_demo.md +++ b/backends/vulkan/docs/android_demo.md @@ -94,8 +94,9 @@ binary using the Android NDK toolchain. cmake . -DCMAKE_INSTALL_PREFIX=cmake-android-out \ -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ -DANDROID_ABI=$ANDROID_ABI \ - -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_VULKAN=ON \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ -DPYTHON_EXECUTABLE=python \ diff --git a/backends/vulkan/partitioner/supported_ops.py b/backends/vulkan/partitioner/supported_ops.py index 08d7f96a6b9..ca7ce72caed 100644 --- a/backends/vulkan/partitioner/supported_ops.py +++ b/backends/vulkan/partitioner/supported_ops.py @@ -8,7 +8,10 @@ import operator -from executorch.backends.vulkan.passes.custom_ops_defs import grid_priors_op # noqa +from executorch.backends.vulkan.passes.custom_ops_defs import ( # noqa + conv_with_clamp_op, + grid_priors_op, +) from executorch.exir.dialects._ops import ops as exir_ops @@ -84,6 +87,7 @@ def __contains__(self, op): CONVOLUTION_OPS = [ exir_ops.edge.aten.convolution.default, + exir_ops.edge.et_vk.conv_with_clamp.default, ] REDUCTION_OPS = [ diff --git a/backends/vulkan/partitioner/vulkan_partitioner.py b/backends/vulkan/partitioner/vulkan_partitioner.py index 4d24877b631..103297bc758 100644 --- a/backends/vulkan/partitioner/vulkan_partitioner.py +++ b/backends/vulkan/partitioner/vulkan_partitioner.py @@ -38,6 +38,9 @@ torch.ops.aten.upsample_nearest2d.vec, ] +logger: logging.Logger = logging.getLogger("") +logger.setLevel(logging.INFO) + class VulkanSupportedOperators(OperatorSupportBase): _ops: OpList = enumerate_supported_ops() @@ -110,7 +113,7 @@ def is_node_supported( ) -> bool: r = self._is_node_supported(submodules, node) if not r and node.op == "call_function": - logging.info(f"Skipping node in Vulkan partitioning: {node.format_node()}") + logger.info(f"Skipping node in Vulkan partitioning: {node.format_node()}") return r def _is_node_supported( @@ -179,9 +182,9 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult: pl = 
len(partition_list) if pl == 0: - logging.warning("No Vulkan subgraphs can be partitioned!") + logger.warning("No Vulkan subgraphs can be partitioned!") else: - logging.info(f"Found {pl} Vulkan subgraphs to be partitioned.") + logger.info(f"Found {pl} Vulkan subgraphs to be partitioned.") tag_constant_data(exported_program) diff --git a/backends/vulkan/passes/custom_ops_defs.py b/backends/vulkan/passes/custom_ops_defs.py index 62f21bfee63..fd586b665a0 100644 --- a/backends/vulkan/passes/custom_ops_defs.py +++ b/backends/vulkan/passes/custom_ops_defs.py @@ -48,6 +48,43 @@ def conv_with_clamp_impl( conv_with_clamp_op = getattr(getattr(torch.ops, namespace), name) +def conv_with_clamp_out_impl( + input, + weight, + bias=None, + stride=1, + padding=0, + dilation=1, + transposed=False, + output_padding=0, + groups=1, + output_min=-float("inf"), + output_max=float("inf"), + out=None, +): + out = conv_with_clamp_impl( + input, + weight, + bias, + stride, + padding, + dilation, + transposed, + output_padding, + groups, + output_min, + output_max, + ) + return out + + +name = "conv_with_clamp.out" +lib.define( + f"{name}(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, Scalar? output_min, Scalar? output_max, *, Tensor(a!) out) -> Tensor(a!)" +) +lib.impl(name, conv_with_clamp_out_impl, "CompositeExplicitAutograd") + + # The dimension of x should be larger than 1 def grid_priors_impl( x, diff --git a/backends/vulkan/runtime/VulkanBackend.cpp b/backends/vulkan/runtime/VulkanBackend.cpp index fd06841beca..7ed9469f77f 100644 --- a/backends/vulkan/runtime/VulkanBackend.cpp +++ b/backends/vulkan/runtime/VulkanBackend.cpp @@ -412,7 +412,7 @@ void maybe_resize_output( // VulkanBackend class // -class VulkanBackend final : public PyTorchBackendInterface { +class VulkanBackend final : public ::executorch::runtime::BackendInterface { public: ~VulkanBackend() override = default; diff --git a/backends/vulkan/runtime/api/api.h b/backends/vulkan/runtime/api/api.h index de77c57fb0e..0f496a4af8a 100644 --- a/backends/vulkan/runtime/api/api.h +++ b/backends/vulkan/runtime/api/api.h @@ -12,7 +12,7 @@ #include #include -#include +#include #include #include diff --git a/backends/vulkan/runtime/api/containers/StagingBuffer.h b/backends/vulkan/runtime/api/containers/StagingBuffer.h new file mode 100644 index 00000000000..6f67ae8a64a --- /dev/null +++ b/backends/vulkan/runtime/api/containers/StagingBuffer.h @@ -0,0 +1,104 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +// @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName + +#include + +#include + +#include + +namespace vkcompute { +namespace api { + +class StagingBuffer final { + private: + Context* context_p_; + vkapi::ScalarType dtype_; + size_t numel_; + size_t nbytes_; + vkapi::VulkanBuffer vulkan_buffer_; + + void* mapped_data_; + + public: + StagingBuffer( + Context* context_p, + const vkapi::ScalarType dtype, + const size_t numel) + : context_p_(context_p), + dtype_(dtype), + numel_(numel), + nbytes_(element_size(dtype_) * numel_), + vulkan_buffer_( + context_p_->adapter_ptr()->vma().create_staging_buffer(nbytes_)), + mapped_data_(nullptr) {} + + StagingBuffer(const StagingBuffer&) = delete; + StagingBuffer& operator=(const StagingBuffer&) = delete; + + StagingBuffer(StagingBuffer&&) = default; + StagingBuffer& operator=(StagingBuffer&&) = default; + + ~StagingBuffer() { + context_p_->register_buffer_cleanup(vulkan_buffer_); + } + + inline vkapi::ScalarType dtype() { + return dtype_; + } + + inline vkapi::VulkanBuffer& buffer() { + return vulkan_buffer_; + } + + inline void* data() { + if (!mapped_data_) { + mapped_data_ = vulkan_buffer_.allocation_info().pMappedData; + } + return mapped_data_; + } + + inline size_t numel() { + return numel_; + } + + inline size_t nbytes() { + return nbytes_; + } + + inline void copy_from(const void* src, const size_t nbytes) { + VK_CHECK_COND(nbytes <= nbytes_); + memcpy(data(), src, nbytes); + vmaFlushAllocation( + vulkan_buffer_.vma_allocator(), + vulkan_buffer_.allocation(), + 0u, + VK_WHOLE_SIZE); + } + + inline void copy_to(void* dst, const size_t nbytes) { + VK_CHECK_COND(nbytes <= nbytes_); + vmaInvalidateAllocation( + vulkan_buffer_.vma_allocator(), + vulkan_buffer_.allocation(), + 0u, + VK_WHOLE_SIZE); + memcpy(dst, data(), nbytes); + } + + inline void set_staging_zeros() { + memset(data(), 0, nbytes_); + } +}; + +} // namespace api +} // namespace vkcompute diff --git a/backends/vulkan/runtime/api/containers/StorageBuffer.h b/backends/vulkan/runtime/api/containers/StorageBuffer.h deleted file mode 100644 index 17c34706057..00000000000 --- a/backends/vulkan/runtime/api/containers/StorageBuffer.h +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#pragma once - -// @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName - -#include - -#include - -namespace vkcompute { -namespace api { - -class StorageBuffer final { - private: - Context* context_p_; - vkapi::ScalarType dtype_; - size_t numel_; - size_t nbytes_; - vkapi::VulkanBuffer vulkan_buffer_; - - public: - StorageBuffer( - Context* context_p, - const vkapi::ScalarType dtype, - const size_t numel, - const bool gpuonly = false) - : context_p_(context_p), - dtype_(dtype), - numel_(numel), - nbytes_(element_size(dtype_) * numel_), - vulkan_buffer_(context_p_->adapter_ptr()->vma().create_storage_buffer( - nbytes_, - gpuonly)) {} - - StorageBuffer(const StorageBuffer&) = delete; - StorageBuffer& operator=(const StorageBuffer&) = delete; - - StorageBuffer(StorageBuffer&&) = default; - StorageBuffer& operator=(StorageBuffer&&) = default; - - ~StorageBuffer() { - context_p_->register_buffer_cleanup(vulkan_buffer_); - } - - inline vkapi::ScalarType dtype() { - return dtype_; - } - - inline vkapi::VulkanBuffer& buffer() { - return vulkan_buffer_; - } - - inline size_t numel() { - return numel_; - } - - inline size_t nbytes() { - return nbytes_; - } -}; - -} // namespace api -} // namespace vkcompute diff --git a/backends/vulkan/runtime/api/containers/Tensor.cpp b/backends/vulkan/runtime/api/containers/Tensor.cpp index 5e67b689735..498ea37f3be 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.cpp +++ b/backends/vulkan/runtime/api/containers/Tensor.cpp @@ -13,41 +13,24 @@ namespace vkcompute { namespace api { -/* - * Given the strides of a buffer-backed tensor, estimate the equivalent memory - * layout enum value by identifying the fastest moving dimension. - */ -utils::GPUMemoryLayout estimate_memory_layout( - const std::vector& dim_order) { - int64_t fastest_dim_whcn = dim_order.size() - 1 - dim_order.back(); - if (fastest_dim_whcn >= 0 && fastest_dim_whcn <= 3) { - return utils::GPUMemoryLayout(fastest_dim_whcn); - } - - // TODO(ssjia) find a way to gracefully recover from this case by i.e. adding - // a UNKOWN GPUMemoryLayout. This is not high priority though because we don't - // expect this to ever come up in practice. 
- VK_THROW("No compatible GPUMemoryLayout value"); -} - std::vector calculate_dim_order( const size_t ndim, - const utils::GPUMemoryLayout memory_layout) { + const int32_t packed_dim) { // Special case for zero dim tensors if (ndim == 0) { return {0}; } std::vector dim_order(ndim); - int64_t last_dim = - ndim - utils::to_packed_dim_nchw_offset(memory_layout); + // Explicitly convert ndim to signed to prevent underflow + int64_t last_dim = int64_t(ndim) - 1 - packed_dim; int64_t cur_dim = 0; for (int d = 0; d < ndim; ++d) { if (d == last_dim) { - cur_dim += 1; + cur_dim++; } dim_order[d] = cur_dim; - cur_dim += 1; + cur_dim++; } if (last_dim >= 0) { dim_order[ndim - 1] = last_dim; @@ -56,44 +39,6 @@ std::vector calculate_dim_order( return dim_order; } -namespace { - -struct StrideDimIndexPair { - int64_t stride; - int64_t dim_i; - - StrideDimIndexPair() : stride(0), dim_i(0) {} - - explicit StrideDimIndexPair(int64_t stride, int64_t dim_i) - : stride(stride), dim_i(dim_i) {} - - bool operator>(const StrideDimIndexPair& other) const { - // Descending order - return stride < other.stride; - } - - bool operator<(const StrideDimIndexPair& other) const { - // Descending order - return stride > other.stride; - } -}; - -} // namespace - -std::vector strides_to_dim_order(const std::vector& strides) { - std::vector stride_dim_pairs(strides.size()); - for (size_t i = 0; i < strides.size(); ++i) { - stride_dim_pairs[i] = StrideDimIndexPair(strides[i], i); - } - std::stable_sort(stride_dim_pairs.begin(), stride_dim_pairs.end()); - - std::vector dim_order(strides.size()); - for (int i = 0; i < strides.size(); ++i) { - dim_order.at(i) = stride_dim_pairs.at(i).dim_i; - } - return dim_order; -} - std::vector calculate_strides( const std::vector& sizes, const std::vector& dim_order) { @@ -118,6 +63,42 @@ std::vector calculate_strides( return strides; } +/* + * Axis mapping is somewhat analogous to strides for texture backed tensors. + * + * The axis mapping is normalized to 4 dimensions, similar to the padded sizes. + * The first 3 values of the axis mapping indicate the (X,Y,Z) image texture + * axis that corresponds to the width, height, and channels dimension of the + * tensor. Thus the axis mapping can be considered to be in WHCN dimension + * order. + * + * The last value `axis_map.at(3)` indicates the WHCN index of the tensor + * dimension along which batches will be concatenated. This dimension can be + * referred to as the "inner dimension" To determine which image texture axis is + * used for the concatenation, a double lookup will need to be performed + * (axis_map.at(axis_map.at(3))). + * + * The reason for strucuring axis mapping this way is because for the batch dim, + * two things need to be easily derived: + * + * 1. The dim idx of the inner dimension, so that the size of the inner + * dimension can be easily determined. + * 2. The texture axis used to concatenate batches + * + * By storing the dim index of the inner dimension instead of the texture axis + * it maps to, both pieces of information are readily available. + * + * The axis mapping allows for permuted views of texture-backed tensors. + */ +std::vector default_axis_map() { + // Currently, all compute shaders have an assumption that the channels dim is + // used to combine with the batch dim of a tensor. 
However, once dim mapping + // is integrated into the tensor indexing logic for each compute shader, we + // can be more flexible with mapping the batch dim to different texture axes + // in order to improve performance or memory footprint. + return {0, 1, 2, 2}; +} + bool dim_order_is_valid(const std::vector& dim_order) { int64_t sum = 0; for (size_t i = 0; i < dim_order.size(); ++i) { @@ -151,7 +132,7 @@ std::vector unsqueeze_strides( std::vector calculate_padded_sizes( const std::vector& sizes, - const utils::GPUMemoryLayout memory_layout) { + const int32_t packed_dim) { int64_t ndim = sizes.size(); if (ndim == 0) { ndim = 1; @@ -165,8 +146,7 @@ std::vector calculate_padded_sizes( } // Pad the packed dim to the next multiple of 4. - const int64_t dim_offset = - utils::to_packed_dim_nchw_offset(memory_layout); + const int64_t dim_offset = packed_dim + 1; const int64_t padded_dim_size = utils::val_at(-dim_offset, sizes); padded_sizes.at(ndim_up4 - dim_offset) = utils::align_up_4(padded_dim_size); @@ -175,30 +155,214 @@ std::vector calculate_padded_sizes( utils::uvec3 calculate_image_extents( const std::vector& padded_sizes, - const utils::GPUMemoryLayout memory_layout) { + const std::vector& axis_map, + const int32_t packed_dim) { VK_CHECK_COND(padded_sizes.size() == 4); + VK_CHECK_COND(axis_map.size() == 4); + + utils::uvec3 extents({1, 1, 1}); + // First three elements of axis_map indicate which (X,Y,Z) image axis the + // width, height, and channels dim of the tensor maps to. + for (int whcn_dim = 0; whcn_dim < 3; ++whcn_dim) { + const int64_t axis = axis_map.at(whcn_dim); + const int64_t dim = padded_sizes.size() - 1 - whcn_dim; + extents[axis] = utils::safe_downcast(padded_sizes.at(dim)); + } + + // axis_map[3] indicates the WHCN index of the dimension used for batch + // concatenation. Thus a double lookup is required to determine the image axis + // used for batch concatenation. + const int64_t concatted_whcn_dim = axis_map.at(3); + const int64_t batch_axis = axis_map.at(concatted_whcn_dim); + // Multiply the extents of the batch axis by the batch size. 
+ extents[batch_axis] *= padded_sizes.at(0); + + VK_CHECK_COND(extents[axis_map.at(packed_dim)] % 4 == 0); + extents[axis_map.at(packed_dim)] /= 4; + return extents; +} + +// +// vTensorStorage +// - uint32_t N = utils::safe_downcast(padded_sizes.at(0)); - uint32_t C = utils::safe_downcast(padded_sizes.at(1)); - uint32_t H = utils::safe_downcast(padded_sizes.at(2)); - uint32_t W = utils::safe_downcast(padded_sizes.at(3)); +vkapi::VulkanImage allocate_image( + Context* const context_ptr, + utils::uvec3& image_extents, + const utils::StorageType storage_type, + const VkFormat image_format, + const bool allocate_memory) { + vkapi::Adapter* adapter_ptr = context_ptr->adapter_ptr(); - switch (memory_layout) { - case utils::kWidthPacked: - VK_CHECK_COND(W % 4 == 0); - W /= 4; + vkapi::ImageSampler::Properties sampler_props{ + VK_FILTER_NEAREST, + VK_SAMPLER_MIPMAP_MODE_NEAREST, + VK_SAMPLER_ADDRESS_MODE_REPEAT, + VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK, + }; + + VkImageType image_type = VK_IMAGE_TYPE_3D; + VkImageViewType image_view_type; + + switch (storage_type) { + case utils::kTexture3D: + image_type = VK_IMAGE_TYPE_3D; + image_view_type = VK_IMAGE_VIEW_TYPE_3D; break; - case utils::kHeightPacked: - VK_CHECK_COND(H % 4 == 0); - H /= 4; + case utils::kTexture2D: + image_type = VK_IMAGE_TYPE_2D; + image_view_type = VK_IMAGE_VIEW_TYPE_2D; break; - case utils::kChannelsPacked: - VK_CHECK_COND(C % 4 == 0); - C /= 4; + default: + // Return an empty VulkanImage by default + return vkapi::VulkanImage(); + } + + VkSampler sampler = adapter_ptr->sampler_cache().retrieve(sampler_props); + + return adapter_ptr->vma().create_image( + vkapi::create_extent3d(image_extents), + image_format, + image_type, + image_view_type, + sampler_props, + sampler, + /*allow_transfer = */ true, + /*allocate_memory = */ allocate_memory); +} + +vkapi::VulkanBuffer allocate_buffer( + Context* const context_ptr, + const int64_t numel, + const utils::StorageType storage_type, + const vkapi::ScalarType dtype, + const bool allocate_memory) { + vkapi::Adapter* adapter_ptr = context_ptr->adapter_ptr(); + + switch (storage_type) { + case utils::kBuffer: break; + default: + // Return an empty VulkanBuffer if Buffer storage is not used + return vkapi::VulkanBuffer(); + } + + return adapter_ptr->vma().create_storage_buffer( + element_size(dtype) * numel, allocate_memory); +} + +vTensorStorage::vTensorStorage( + Context* const context, + const utils::StorageType storage_type, + const std::vector& axis_map, + const int32_t packed_dim, + const std::vector& padded_sizes, + const vkapi::ScalarType dtype, + const bool allocate_memory) + : context_(context), + storage_type_{storage_type}, + image_extents_( + calculate_image_extents(padded_sizes, axis_map, packed_dim)), + buffer_length_{utils::multiply_integers(padded_sizes)}, + buffer_offset_{0}, + image_(allocate_image( + context_, + image_extents_, + storage_type_, + to_vkformat(dtype), + allocate_memory)), + buffer_(allocate_buffer( + context_, + buffer_length_, + storage_type_, + dtype, + allocate_memory)), + last_access_{} {} + +vTensorStorage::vTensorStorage( + const vTensorStorage& other, + const int64_t buffer_offset) + : context_(other.context_), + storage_type_{other.storage_type_}, + image_extents_(other.image_extents_), + buffer_length_{other.buffer_length_}, + buffer_offset_{buffer_offset}, + image_(other.image_), + buffer_(other.buffer_, buffer_offset), + last_access_{other.last_access_} {} + +vTensorStorage::~vTensorStorage() { + flush(); +} + +void vTensorStorage::flush() { + 
if (image_) { + context_->register_image_cleanup(image_); + } else if (buffer_) { + context_->register_buffer_cleanup(buffer_); } + last_access_ = {}; +} - return {W, H, C * N}; +void vTensorStorage::transition( + vkapi::PipelineBarrier& pipeline_barrier, + const vkapi::PipelineStageFlags cur_stage, + const vkapi::MemoryAccessFlags cur_access) { + // Get last stage access + vkapi::PipelineStageFlags prev_stage = last_access_.stage; + vkapi::MemoryAccessFlags prev_access = last_access_.access; + + const bool prev_written = (prev_access & vkapi::MemoryAccessType::WRITE) != 0; + + VkImageLayout cur_layout = VK_IMAGE_LAYOUT_UNDEFINED; + VkImageLayout new_layout = VK_IMAGE_LAYOUT_UNDEFINED; + bool layout_changed = false; + if (image_) { + cur_layout = image_.layout(); + new_layout = vkapi::vk_layout(cur_stage, cur_access); + + layout_changed = cur_layout != new_layout; + } + + if (prev_written || layout_changed) { + VkPipelineStageFlags src_stage = vkapi::vk_stage(prev_stage); + if (0u == src_stage) { + src_stage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + } + VkPipelineStageFlags dst_stage = vkapi::vk_stage(cur_stage); + if (0u == dst_stage) { + dst_stage = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT; + } + + pipeline_barrier.stage.src |= src_stage; + pipeline_barrier.stage.dst |= dst_stage; + + if (image_) { + pipeline_barrier.images.emplace_back( + vkapi::vk_access(prev_stage, prev_access), + vkapi::vk_access(cur_stage, cur_access), + cur_layout, + new_layout, + image_); + + image_.set_layout(new_layout); + } else if (buffer_) { + pipeline_barrier.buffers.emplace_back( + vkapi::vk_access(prev_stage, prev_access), + vkapi::vk_access(cur_stage, cur_access), + buffer_); + } + } + + last_access_.stage = cur_stage; + last_access_.access = cur_access; +} + +bool vTensorStorage::is_copy_of(const vTensorStorage& other) const { + if (storage_type_ == utils::kBuffer) { + return buffer_.is_copy_of(other.buffer_); + } + return image_.is_copy_of(other.image_); } // @@ -213,26 +377,29 @@ vTensor::vTensor( const utils::GPUMemoryLayout memory_layout, const bool allocate_memory) : dtype_(dtype), - memory_layout_(memory_layout), - // Calculate tensor size metadata + // Calculate tensor metadata sizes_(sizes.begin(), sizes.end()), - dim_order_(calculate_dim_order(sizes_.size(), memory_layout_)), + packed_dim_(utils::to_packed_dim(memory_layout)), + dim_order_(calculate_dim_order(sizes_.size(), packed_dim_)), + axis_map_(default_axis_map()), strides_(calculate_strides(sizes, dim_order_)), numel_(utils::multiply_integers(sizes_)), - padded_sizes_{calculate_padded_sizes(sizes, memory_layout_)}, + padded_sizes_{calculate_padded_sizes(sizes, packed_dim_)}, unsqueezed_strides_{unsqueeze_strides(strides_, numel_)}, padded_numel_(utils::multiply_integers(padded_sizes_)), - texture_limits_{{0, 0, 0}}, + logical_limits_{{0, 0, 0}}, // Utility Uniform Buffers that can be passed to shaders as arguments sizes_uniform_(), strides_uniform_(), numel_uniform_(), - texture_limits_uniform_(), + axis_map_uniform_(), + logical_limits_uniform_(), // Construct Tensor storage storage_( context, storage_type, - memory_layout_, + axis_map_, + packed_dim_, padded_sizes_, dtype_, allocate_memory) { @@ -240,10 +407,7 @@ vTensor::vTensor( dim_order_is_valid(dim_order_), "computed dim order is invalid"); if (storage_type != utils::kBuffer) { - texture_limits_.limits = utils::ivec3{ - utils::safe_downcast(storage_.image_extents_[0]), - utils::safe_downcast(storage_.image_extents_[1]), - utils::safe_downcast(storage_.image_extents_[2])}; + 
set_logical_limits(storage_.image_extents_); } if (dtype == vkapi::kHalf) { @@ -256,10 +420,11 @@ vTensor::vTensor( vTensor::vTensor(const vTensor& other) : dtype_(other.dtype_), - memory_layout_(other.memory_layout_), // Copy tensor size metadata sizes_(other.sizes_.begin(), other.sizes_.end()), + packed_dim_{other.packed_dim_}, dim_order_(other.dim_order_.begin(), other.dim_order_.end()), + axis_map_(other.axis_map_.begin(), other.axis_map_.end()), strides_(other.strides_.begin(), other.strides_.end()), numel_(other.numel_), padded_sizes_{other.padded_sizes_.begin(), other.padded_sizes_.end()}, @@ -267,12 +432,13 @@ vTensor::vTensor(const vTensor& other) other.unsqueezed_strides_.begin(), other.unsqueezed_strides_.end()}, padded_numel_(other.padded_numel_), - texture_limits_{other.texture_limits_}, + logical_limits_{other.logical_limits_}, // Empty initialize Utility Uniform Buffers sizes_uniform_(), strides_uniform_(), numel_uniform_(), - texture_limits_uniform_(), + axis_map_uniform_(), + logical_limits_uniform_(), // Copy Tensor storage storage_(other.storage_) {} @@ -282,21 +448,23 @@ vTensor::vTensor( const std::vector& dim_order, const int64_t offset_numel) : dtype_(other.dtype_), - memory_layout_(estimate_memory_layout(dim_order)), // Copy tensor size metadata sizes_(sizes.begin(), sizes.end()), + packed_dim_(other.packed_dim_), dim_order_(dim_order.begin(), dim_order.end()), + axis_map_(default_axis_map()), strides_(calculate_strides(sizes_, dim_order_)), numel_(utils::multiply_integers(sizes_)), - padded_sizes_{calculate_padded_sizes(sizes, memory_layout_)}, + padded_sizes_{calculate_padded_sizes(sizes, packed_dim_)}, unsqueezed_strides_{unsqueeze_strides(strides_, numel_)}, padded_numel_(utils::multiply_integers(padded_sizes_)), - texture_limits_{{0, 0, 0}}, + logical_limits_(other.logical_limits_), // Empty initialize Utility Uniform Buffers sizes_uniform_(), strides_uniform_(), numel_uniform_(), - texture_limits_uniform_(), + axis_map_uniform_(), + logical_limits_uniform_(), // Copy Tensor storage storage_(other.storage_, vkapi::element_size(dtype_) * offset_numel) { VK_CHECK_COND( @@ -337,6 +505,25 @@ vkapi::VulkanBuffer& vTensor::buffer( return storage_.buffer_; } +void vTensor::set_logical_limits(const utils::uvec3& image_extents) { + logical_limits_.limits[0] = image_extents[axis_map_.at(0)]; + logical_limits_.limits[1] = image_extents[axis_map_.at(1)]; + logical_limits_.limits[2] = image_extents[axis_map_.at(2)]; +} + +utils::GPUMemoryLayout vTensor::estimate_memory_layout() const { + switch (packed_dim_) { + case WHCN::kWidthDim: + return utils::kWidthPacked; + case WHCN::kHeightDim: + return utils::kHeightPacked; + case WHCN::kChannelsDim: + return utils::kChannelsPacked; + default: + VK_THROW("Invalid packed dim"); + } +} + const vkapi::BufferBindInfo vTensor::sizes_ubo() { if (!sizes_uniform_.buffer()) { sizes_uniform_ = @@ -353,11 +540,19 @@ const vkapi::BufferBindInfo vTensor::strides_ubo() { return vkapi::BufferBindInfo(strides_uniform_.buffer()); } -const vkapi::BufferBindInfo vTensor::texture_limits_ubo() { - if (!texture_limits_uniform_.buffer()) { - texture_limits_uniform_ = ParamsBuffer(storage_.context_, texture_limits_); +const vkapi::BufferBindInfo vTensor::axis_map_ubo() { + if (!axis_map_uniform_.buffer()) { + axis_map_uniform_ = + ParamsBuffer(storage_.context_, utils::make_ivec4(axis_map_)); } - return vkapi::BufferBindInfo(texture_limits_uniform_.buffer()); + return vkapi::BufferBindInfo(axis_map_uniform_.buffer()); +} + +const vkapi::BufferBindInfo 
vTensor::logical_limits_ubo() { + if (!logical_limits_uniform_.buffer()) { + logical_limits_uniform_ = ParamsBuffer(storage_.context_, logical_limits_); + } + return vkapi::BufferBindInfo(logical_limits_uniform_.buffer()); } const vkapi::BufferBindInfo vTensor::numel_ubo() { @@ -380,17 +575,6 @@ size_t vTensor::staging_buffer_numel() const { return padded_numel_; } -VmaAllocationCreateInfo vTensor::get_allocation_create_info() const { - switch (storage_type()) { - case utils::kBuffer: - return storage_.buffer_.allocation_create_info(); - case utils::kTexture2D: - case utils::kTexture3D: - return storage_.image_.allocation_create_info(); - } - return {}; -} - VkMemoryRequirements vTensor::get_memory_requirements() const { switch (storage_type()) { case utils::kBuffer: @@ -414,51 +598,34 @@ void vTensor::bind_allocation(const vkapi::Allocation& allocation) { } } -void vTensor::update_metadata( - const std::vector& new_sizes, - const std::vector& new_dim_order) { - sizes_ = new_sizes; - dim_order_ = new_dim_order; +void vTensor::update_metadata() { strides_ = calculate_strides(sizes_, dim_order_); - // Only update the memory layout for buffer-backed tensors. Strides are - // meaningless for texture-backed tensors and do not impact the memory layout. - if (storage_type() == utils::kBuffer) { - memory_layout_ = estimate_memory_layout(dim_order_); - } numel_ = utils::multiply_integers(sizes_); - padded_sizes_ = calculate_padded_sizes(sizes_, memory_layout_); + padded_sizes_ = calculate_padded_sizes(sizes_, packed_dim_); unsqueezed_strides_ = unsqueeze_strides(strides_, numel_); padded_numel_ = utils::multiply_integers(padded_sizes_); - // Calculate the extents of the image texture that would have been required - // for a tensor of the new sizes. - utils::uvec3 virtual_extents = - calculate_image_extents(padded_sizes_, memory_layout_); - - // Update the texture limits to reflect the new virtual extents. - texture_limits_.limits = utils::ivec3{ - utils::safe_downcast(virtual_extents[0]), - utils::safe_downcast(virtual_extents[1]), - utils::safe_downcast(virtual_extents[2])}; + // Calculate the image extents that would have been used to allocate a texture + // withthe current sizes, and use that to set the logical limits. + set_logical_limits( + calculate_image_extents(padded_sizes_, axis_map_, packed_dim_)); if (sizes_uniform_.buffer()) { sizes_uniform_.update(utils::make_whcn_ivec4(sizes_)); } - if (texture_limits_uniform_.buffer()) { - texture_limits_uniform_.update(texture_limits_); - } if (strides_uniform_.buffer()) { strides_uniform_.update(utils::make_whcn_ivec4(unsqueezed_strides_)); } if (numel_uniform_.buffer()) { numel_uniform_.update(numel_); } -} - -void vTensor::update_size_metadata(const std::vector& new_sizes) { - // Dim order does not change on resize - update_metadata(new_sizes, dim_order_); + if (axis_map_uniform_.buffer()) { + axis_map_uniform_.update(utils::make_ivec4(axis_map_)); + } + if (logical_limits_uniform_.buffer()) { + logical_limits_uniform_.update(logical_limits_); + } } void vTensor::check_sizes(const std::vector& sizes) const { @@ -466,16 +633,20 @@ void vTensor::check_sizes(const std::vector& sizes) const { // For texture storage check that the current texture is large enough for // the new sizes of the tensor. 
utils::uvec3 virtual_extents = - calculate_image_extents(padded_sizes_, memory_layout_); + calculate_image_extents(padded_sizes_, axis_map_, packed_dim_); - bool valid_resize = virtual_extents[0] <= image_extents()[0]; - valid_resize = valid_resize && virtual_extents[1] <= image_extents()[1]; - valid_resize = valid_resize && virtual_extents[2] <= image_extents()[2]; + bool valid_resize = virtual_extents[0] <= storage_.image_extents_[0]; + valid_resize = + valid_resize && virtual_extents[1] <= storage_.image_extents_[1]; + valid_resize = + valid_resize && virtual_extents[2] <= storage_.image_extents_[2]; VK_CHECK_COND( valid_resize, "tensor sizes requires a larger texture than the current one."); } else { + // For buffer storage check that the current buffer is large enough for the + // new sizes of the tensor. int64_t numel = utils::multiply_integers(sizes); bool valid_resize = numel + storage_.buffer_offset_ <= storage_.buffer_length_; @@ -489,232 +660,72 @@ void vTensor::virtual_reconfigure( const std::vector& new_sizes, const std::vector& new_dim_order) { VK_CHECK_COND( - dim_order_is_valid(new_dim_order), "new dim order provided is invalid"); - check_sizes(new_sizes); - update_metadata(new_sizes, new_dim_order); -} + storage_type() == utils::kBuffer, + "virtual_reconfigure is only applicable for buffer backed tensors"); + VK_CHECK_COND(new_sizes.size() == new_dim_order.size()); + VK_CHECK_COND(dim_order_is_valid(new_dim_order)); -void vTensor::virtual_resize(const std::vector& new_sizes) { check_sizes(new_sizes); - update_size_metadata(new_sizes); -} - -void vTensor::reallocate(const std::vector& new_sizes) { - update_size_metadata(new_sizes); - storage_.discard_and_reallocate( - calculate_padded_sizes(new_sizes, memory_layout_), - memory_layout_, - dtype_); -} - -// -// vTensorStorage -// - -vkapi::VulkanImage allocate_image( - Context* const context_ptr, - utils::uvec3& image_extents, - const utils::StorageType storage_type, - const VkFormat image_format, - const bool allocate_memory) { - vkapi::Adapter* adapter_ptr = context_ptr->adapter_ptr(); - - vkapi::ImageSampler::Properties sampler_props{ - VK_FILTER_NEAREST, - VK_SAMPLER_MIPMAP_MODE_NEAREST, - VK_SAMPLER_ADDRESS_MODE_REPEAT, - VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK, - }; - - VkImageType image_type = VK_IMAGE_TYPE_3D; - VkImageViewType image_view_type; - - switch (storage_type) { - case utils::kTexture3D: - image_type = VK_IMAGE_TYPE_3D; - image_view_type = VK_IMAGE_VIEW_TYPE_3D; - break; - case utils::kTexture2D: - image_type = VK_IMAGE_TYPE_2D; - image_view_type = VK_IMAGE_VIEW_TYPE_2D; - break; - default: - // Return an empty VulkanImage by default - return vkapi::VulkanImage(); - } - - VkSampler sampler = adapter_ptr->sampler_cache().retrieve(sampler_props); - - return adapter_ptr->vma().create_image( - vkapi::create_extent3d(image_extents), - image_format, - image_type, - image_view_type, - sampler_props, - sampler, - /*allow_transfer = */ true, - /*allocate_memory = */ allocate_memory); -} - -vkapi::VulkanBuffer allocate_buffer( - Context* const context_ptr, - const int64_t numel, - const utils::StorageType storage_type, - const vkapi::ScalarType dtype, - const bool allocate_memory) { - vkapi::Adapter* adapter_ptr = context_ptr->adapter_ptr(); - - switch (storage_type) { - case utils::kBuffer: - break; - default: - // Return an empty VulkanBuffer if Buffer storage is not used - return vkapi::VulkanBuffer(); - } - - return adapter_ptr->vma().create_storage_buffer( - element_size(dtype) * numel, /*gpu_only = */ true, 
allocate_memory); + sizes_ = new_sizes; + dim_order_ = new_dim_order; + update_metadata(); } -vTensorStorage::vTensorStorage( - Context* const context, - const utils::StorageType storage_type, - const utils::GPUMemoryLayout gpu_memory_layout, - const std::vector& padded_sizes, - const vkapi::ScalarType dtype, - const bool allocate_memory) - : context_(context), - storage_type_{storage_type}, - image_extents_(calculate_image_extents(padded_sizes, gpu_memory_layout)), - buffer_length_{utils::multiply_integers(padded_sizes)}, - buffer_offset_{0}, - image_(allocate_image( - context_, - image_extents_, - storage_type_, - to_vkformat(dtype), - allocate_memory)), - buffer_(allocate_buffer( - context_, - buffer_length_, - storage_type_, - dtype, - allocate_memory)), - last_access_{} {} - -vTensorStorage::vTensorStorage( - const vTensorStorage& other, - const int64_t buffer_offset) - : context_(other.context_), - storage_type_{other.storage_type_}, - image_extents_(other.image_extents_), - buffer_length_{other.buffer_length_}, - buffer_offset_{buffer_offset}, - image_(), - buffer_(other.buffer_, buffer_offset), - last_access_{other.last_access_} { - if (other.storage_type_ != utils::kBuffer) { - VK_THROW("Tensors with texture storage cannot be copied!"); - } -} +void vTensor::virtual_resize(const std::vector& new_sizes) { + VK_CHECK_COND( + new_sizes.size() == dim_order_.size(), + "new sizes cannot modify the dimensionality of the tensor "); -vTensorStorage::~vTensorStorage() { - flush(); + check_sizes(new_sizes); + sizes_ = new_sizes; + update_metadata(); } -void vTensorStorage::flush() { - if (image_) { - context_->register_image_cleanup(image_); - } else if (buffer_) { - context_->register_buffer_cleanup(buffer_); +/* + * Transposing the dim order is a bit unintuitive. dim0 and dim1 have swapped + * their "identities", so we need to swap the values of dim0 and dim1 wherever + * they appear in the dim order vector. Compare this to just swapping the + * elements at dim0 and dim1 in the `sizes` vectors. 
+ */ +void transpose_dim_order_inplace( + std::vector& dim_order, + const int64_t dim0, + const int64_t dim1) { + for (int i = 0; i < dim_order.size(); ++i) { + if (dim_order[i] == dim0) { + dim_order[i] = dim1; + } else if (dim_order[i] == dim1) { + dim_order[i] = dim0; + } } - last_access_ = {}; } -void vTensorStorage::transition( - vkapi::PipelineBarrier& pipeline_barrier, - const vkapi::PipelineStageFlags cur_stage, - const vkapi::MemoryAccessFlags cur_access) { - // Get last stage access - vkapi::PipelineStageFlags prev_stage = last_access_.stage; - vkapi::MemoryAccessFlags prev_access = last_access_.access; +void vTensor::virtual_transpose(const int64_t dim0, const int64_t dim1) { + std::iter_swap(sizes_.begin() + dim0, sizes_.begin() + dim1); - const bool prev_written = (prev_access & vkapi::MemoryAccessType::WRITE) != 0; - - VkImageLayout cur_layout = VK_IMAGE_LAYOUT_UNDEFINED; - VkImageLayout new_layout = VK_IMAGE_LAYOUT_UNDEFINED; - bool layout_changed = false; - if (image_) { - cur_layout = image_.layout(); - new_layout = vkapi::vk_layout(cur_stage, cur_access); - - layout_changed = cur_layout != new_layout; + const int dim0_whcn = sizes_.size() - 1 - dim0; + const int dim1_whcn = sizes_.size() - 1 - dim1; + if (packed_dim_ == dim0_whcn) { + packed_dim_ = dim1_whcn; + } else if (packed_dim_ == dim1_whcn) { + packed_dim_ = dim0_whcn; } - if (prev_written || layout_changed) { - VkPipelineStageFlags src_stage = vkapi::vk_stage(prev_stage); - if (0u == src_stage) { - src_stage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; - } - VkPipelineStageFlags dst_stage = vkapi::vk_stage(cur_stage); - if (0u == dst_stage) { - dst_stage = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT; - } - - pipeline_barrier.stage.src |= src_stage; - pipeline_barrier.stage.dst |= dst_stage; - - if (image_) { - pipeline_barrier.images.emplace_back( - vkapi::vk_access(prev_stage, prev_access), - vkapi::vk_access(cur_stage, cur_access), - cur_layout, - new_layout, - image_); - - image_.set_layout(new_layout); - } else if (buffer_) { - pipeline_barrier.buffers.emplace_back( - vkapi::vk_access(prev_stage, prev_access), - vkapi::vk_access(cur_stage, cur_access), - buffer_); + if (storage_type() == utils::kBuffer) { + transpose_dim_order_inplace(dim_order_, dim0, dim1); + } else { + // Cannot transpose batch dimension for texture storage + VK_CHECK_COND(dim0_whcn < 3 && dim1_whcn < 3); + std::iter_swap( + axis_map_.begin() + dim0_whcn, axis_map_.begin() + dim1_whcn); + // Update the "identity" of the concatted dimension + if (axis_map_.at(3) == dim0_whcn) { + axis_map_.at(3) = dim1_whcn; + } else if (axis_map_.at(3) == dim1_whcn) { + axis_map_.at(3) = dim0_whcn; } } - - last_access_.stage = cur_stage; - last_access_.access = cur_access; -} - -bool vTensorStorage::is_copy_of(const vTensorStorage& other) const { - if (storage_type_ != other.storage_type_) { - return false; - } - if (storage_type_ == utils::kBuffer) { - return buffer_.is_copy_of(other.buffer_); - } - return false; -} - -void vTensorStorage::discard_and_reallocate( - const std::vector& padded_sizes, - const utils::GPUMemoryLayout gpu_memory_layout, - const vkapi::ScalarType dtype) { - const bool image_owns_memory = image_.owns_memory(); - const bool buffer_owns_memory = buffer_.owns_memory(); - - flush(); - - image_extents_ = calculate_image_extents(padded_sizes, gpu_memory_layout); - image_ = allocate_image( - context_, - image_extents_, - storage_type_, - to_vkformat(dtype), - image_owns_memory); - - buffer_length_ = utils::multiply_integers(padded_sizes); - buffer_ = 
allocate_buffer( - context_, buffer_length_, storage_type_, dtype, buffer_owns_memory); + update_metadata(); } } // namespace api diff --git a/backends/vulkan/runtime/api/containers/Tensor.h b/backends/vulkan/runtime/api/containers/Tensor.h index 11747c262d8..bbc80b85831 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.h +++ b/backends/vulkan/runtime/api/containers/Tensor.h @@ -19,15 +19,6 @@ namespace vkcompute { namespace api { -/* - * Given the strides of a tensor in NCHW dimension order, calculate the dim - * order of the tensor by computing an index sort of the strides. Note that - * there is some ambiguity when multiple dimensions have the same stride; - * stable_sort is used to preserve the ordering of "outer" dimensions with - * respect to "inner" dimensions. - */ -std::vector strides_to_dim_order(const std::vector& strides); - /* * Given a GPUMemoryLayout value, produce a dim order vector that matches the * given memory layout. The produced dim order vector will be in the NCHW @@ -35,7 +26,7 @@ std::vector strides_to_dim_order(const std::vector& strides); */ std::vector calculate_dim_order( const size_t ndim, - const utils::GPUMemoryLayout memory_layout); + const int32_t packed_dim); /* * Given the sizes of a tensor and the dim order of the tensor (both in NCHW) @@ -66,15 +57,15 @@ std::vector unsqueeze_strides( */ std::vector calculate_padded_sizes( const std::vector& sizes, - const utils::GPUMemoryLayout memory_layout); + const int32_t packed_dim); /* - * Given the padded sizes of a tensor and the GPU memory layout, calculate the - * 3D image extents required to store the tensor data as an image texture. + * Calculate the image extents required of a texture backed tensor. */ utils::uvec3 calculate_image_extents( const std::vector& padded_sizes, - const utils::GPUMemoryLayout memory_layout); + const std::vector& axis_map, + const int32_t packed_dim); struct LastAccess { vkapi::PipelineStageFlags stage; @@ -98,8 +89,9 @@ class vTensorStorage final { vTensorStorage( Context* context, const utils::StorageType storage_type, - const utils::GPUMemoryLayout gpu_memory_layout, - const std::vector& sizes, + const std::vector& axis_map, + const int32_t packed_dim, + const std::vector& padded_sizes, const vkapi::ScalarType dtype, const bool allocate_memory = true); @@ -165,11 +157,6 @@ class vTensorStorage final { * Used for checking if this vTensorStorage is a copy of another instance */ bool is_copy_of(const vTensorStorage& other) const; - - void discard_and_reallocate( - const std::vector& padded_sizes, - const utils::GPUMemoryLayout gpu_memory_layout, - const vkapi::ScalarType dtype); }; class vTensor final { @@ -227,18 +214,59 @@ class vTensor final { vTensor& operator=(vTensor&& other) = default; private: - vkapi::ScalarType dtype_; - utils::GPUMemoryLayout memory_layout_; + /* + * "Core" tensor metadata. They are the minimum amount of information required + * to construct a tensor. + */ + // Whether the tensor has elements of type float, int, etc. + vkapi::ScalarType dtype_; // sizes of the tensor in NCHW dimension order std::vector sizes_; - // dim order of the tensor in NCHW dimension order + // Describes which dimension is "tightly packed" using WHCN index (i.e. 0 for + // width, 1 for height, etc.). For texture backed tensors, this describes + // which dimension is packed along a texel. For buffer backed tensors, this + // describes which dimension has a stride of 1 (i.e. is last in the dim + // order). + int32_t packed_dim_; + + /* + * "Layout" metadata. 
These describe with further detail how tensor data is + * laid out in memory. However, they are considered secondary to the "core" + * metadata members above because defaults can be assumed based on a given + * memory layout. When permuting the tensor without performing a copy, these + * metadata members are the ones that will be changed. All other metadata is + * derived from a combination of sizes, memory layout, and the below members. + */ + + // dim order of the tensor; dimension indices are in NCHW dimension order + // i.e. 0 is N, 1 is C, 2 is H, 3 is W for a 4D tensor. The dims with larger + // strides precede the dims with smaller strides in the dim order. The last + // dim is always the fastest moving dim with a stride of 1. std::vector dim_order_; + // Describes which axis of an image texture each dimension of the tensor maps + // to. The axis mapping allows texture based tensors to be permuted and + // transposed without modifying the underlying texture storage. For a more in + // depth explanation of axis mapping, see the `default_axis_map()` + // function. + std::vector axis_map_; + + /* + * The below can be consider "layout" metadata as well, but are derived from + * the above data members. + */ + // strides of the tensor in NCHW dimension order std::vector strides_; // Contains the number of elements in the tensor according to the canonical // sizes. size_t numel_; + + /* + * The below metadata members are derived from the above, and are typically + * to i.e. pass tensor metadata to compute shaders. + */ + // padded sizes of the tensor in NCHW dimension order. See the // calculate_padded_sizes() function for more context. Note that padded sizes // are only used for texture storage, and not for buffer storage. @@ -249,10 +277,8 @@ class vTensor final { // Contains the number of elements in the tensor according to the padded // sizes. size_t padded_numel_; - // Contains the "virtual" texture extents of the tensor. See the - // texture_limits() function for more context. Note that the texture limits - // are only relevant for texture storage, and not for buffer storage. - TextureLimits texture_limits_; + // See the comments documenting logical_limits() for more context. + TextureLimits logical_limits_; /* * Utility GPU buffers that can be passed to shaders in order to convey tensor @@ -266,7 +292,8 @@ class vTensor final { ParamsBuffer sizes_uniform_; ParamsBuffer strides_uniform_; ParamsBuffer numel_uniform_; - ParamsBuffer texture_limits_uniform_; + ParamsBuffer axis_map_uniform_; + ParamsBuffer logical_limits_uniform_; vTensorStorage storage_; @@ -313,8 +340,29 @@ class vTensor final { return storage_.storage_type_ == utils::kBuffer; } - inline const utils::uvec3& image_extents() const { - return storage_.image_extents_; + private: + void set_logical_limits(const utils::uvec3& image_extents); + + public: + /* + * The logical limits of the tensor are derived from the image extents of the + * image texture used to store the tensor, but with two key differences. + * + * First, the image extents are permuted according to the axis map. This + * makes it so that the first element of the logical limit is the limit of the + * texture axis corresponding to the width dimension of the tensor, the next + * element is the limit of the texture axis corresponding to the height + * dimension and the last element is the limit of the texture axis that + * corresponds to the channels dimension of the tensor. 
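To make the axis-map permutation concrete, here is a small self-contained C++ sketch (illustrative only; the array types and the default-style axis map value {0, 1, 2, 2} are assumptions, not the runtime's exact representation) showing how physical image extents are reordered into WHC-ordered logical limits, and how a width/height transpose changes only the axis map rather than the underlying texture.

#include <array>
#include <cstdint>
#include <iostream>

// Permute the physical image extents through an axis map so that
// logical_limits[0] is the extent of the texture axis backing the tensor's
// width dim, [1] the height dim, and [2] the channels dim.
std::array<int32_t, 3> to_logical_limits(
    const std::array<uint32_t, 3>& image_extents,
    const std::array<int32_t, 4>& axis_map) {
  return {
      static_cast<int32_t>(image_extents[axis_map[0]]),
      static_cast<int32_t>(image_extents[axis_map[1]]),
      static_cast<int32_t>(image_extents[axis_map[2]])};
}

int main() {
  // Assumed default axis map {0, 1, 2, 2}: W -> texture x, H -> texture y,
  // C -> texture z; the last entry names the dim batches are folded into.
  std::array<uint32_t, 3> extents = {8, 4, 2};
  auto limits = to_logical_limits(extents, {0, 1, 2, 2});
  std::cout << limits[0] << " " << limits[1] << " " << limits[2] << "\n"; // 8 4 2

  // After a virtual transpose of the width and height dims the axis map's
  // first two entries swap; the same extents are now read in swapped order.
  limits = to_logical_limits(extents, {1, 0, 2, 2});
  std::cout << limits[0] << " " << limits[1] << " " << limits[2] << "\n"; // 4 8 2
  return 0;
}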
+ * + * Second, the logical limits may use smaller extents than the actual image + * extents of the image texture. This is due to dynamic shape; if the tensor's + * `virtual_resize()` function is called, then the logical limits will reflect + * the extents that would be needed to support a tensor with the updated sizes + * instead of the original sizes. + */ + inline const utils::ivec3& logical_limits() const { + return logical_limits_.limits; } /* @@ -324,12 +372,18 @@ class vTensor final { return dtype_; } - inline utils::GPUMemoryLayout gpu_memory_layout() const { - return memory_layout_; - } + /* + * Provide a "best guess" of a memory layout that can be used to construct a + * tensor with similar layout metadata (i.e. strides, axis_map, etc.) as this + * tensor. In some scenarios, the exact layout of the tensor may not be able + * to be replicated due to calling `virtual_*()` functions after construction; + * however, this function will provide a memory layout that will produce the + * same `packed_dim_` as this tensor. + */ + utils::GPUMemoryLayout estimate_memory_layout() const; - inline int32_t packed_dim_whcn_idx() const { - return static_cast(memory_layout_); + inline int32_t packed_dim() const { + return packed_dim_; } inline const std::vector& sizes() const { @@ -348,6 +402,10 @@ class vTensor final { return dim_order_; } + inline const std::vector& axis_map() const { + return axis_map_; + } + inline const std::vector& strides() const { return strides_; } @@ -372,25 +430,22 @@ class vTensor final { const vkapi::BufferBindInfo strides_ubo(); /* - * Returns a GPU buffer containing the virtual image extents of the tensor. - * Since a tensor can be resized with the virtual_resize() function, this - * GPU buffer contains the image extents of the tensor calculated using the - * virtual_resize() function. This allows shaders to exit early if they are - * working outside the limits of the texture. - * - * This buffer should only be used to + * Returns a GPU buffer containing the texture axis mapping for each dimension + * of the tensor, in WHCN dimension order. + */ + const vkapi::BufferBindInfo axis_map_ubo(); + + /* + * Returns a GPU buffer containing the logical limits of the tensor. See the + * comments for logical_limits() for more context. */ - const vkapi::BufferBindInfo texture_limits_ubo(); + const vkapi::BufferBindInfo logical_limits_ubo(); /* * Returns the number of elements in the buffer used to store the tensor. */ const vkapi::BufferBindInfo numel_ubo(); - inline const utils::ivec3 texture_limits() const { - return texture_limits_.limits; - } - inline size_t numel() const { return numel_; } @@ -429,26 +484,10 @@ class vTensor final { private: /* - * Update the sizes, dim order, and strides metadata of the vTensor. - * - * The dim order is used as the "source of truth" for the strides and the - * strides are calculated from the dim order, therefore only the dim order is - * accepted as an argument to this function. Within the function, the new - * strides are computed from the new sizes and new dim order. - * - * Should not be used directly, reallocate() or virtual_resize() should be - * used instead. - */ - void update_metadata( - const std::vector& new_sizes, - const std::vector& new_dim_order); - - /* - * Convenience overload of update_metadata. Given the new sizes, the new - * strides will be re-calculated based on the current memory layout of the - * tensor. Update_metadata will be called with the new sizes and strides. 
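The surrounding comments treat the dim order as the source of truth for strides. The following self-contained sketch (illustrative, not the runtime's implementation) shows how NCHW strides fall out of the sizes plus a dim order listed from largest stride to smallest, with the last dim in the order always getting stride 1.

#include <cstdint>
#include <iostream>
#include <vector>

// The stride of each dim is the product of the sizes of all dims that come
// after it in the dim order; the last dim in the order gets stride 1.
std::vector<int64_t> strides_from_dim_order(
    const std::vector<int64_t>& sizes,
    const std::vector<int64_t>& dim_order) {
  std::vector<int64_t> strides(sizes.size());
  int64_t running = 1;
  for (auto it = dim_order.rbegin(); it != dim_order.rend(); ++it) {
    strides[*it] = running;
    running *= sizes[*it];
  }
  return strides;
}

int main() {
  std::vector<int64_t> sizes = {2, 3, 4, 5}; // N, C, H, W
  // Contiguous layout {0, 1, 2, 3} -> strides {60, 20, 5, 1}
  for (int64_t s : strides_from_dim_order(sizes, {0, 1, 2, 3})) {
    std::cout << s << " ";
  }
  std::cout << "\n";
  // Channels-last style layout {0, 2, 3, 1} -> strides {60, 1, 15, 3}
  for (int64_t s : strides_from_dim_order(sizes, {0, 2, 3, 1})) {
    std::cout << s << " ";
  }
  std::cout << "\n";
  return 0;
}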
+ * Assuming sizes, dim order, or axis mapping was modified, recompute all + * derived metadata and update metadata UBO with new values. */ - void update_size_metadata(const std::vector& new_sizes); + void update_metadata(); /* * Check that tensor sizes are valid given the current storage resource's @@ -458,13 +497,15 @@ class vTensor final { public: /* - * Virtually resize and "re-stride" the tensor by modifying the size and - * stride metadata that gets used in compute shaders. This allows the shader - * to interpret the underlying resource with the updated metadata. + * Change how the tensor should be interpreted by compute shaders via updating + * the size and dim order of the tensor. The new sizes and dim order may have + * different dimensionality than the current dimensionality of the tensor. + * + * This function can only be used for buffer-backed tensors, since texture + * backed buffers cannot change dimensionality or memory layout. * - * Note that the dim order is used as the source of truth for the strides; the - * strides are computed using the new sizes and new dim order, thus only the - * dim order is accepted as an argument to this function. + * TODO(ssjia): delete this API. prefer functions such as virtual_transpose + * instead. */ void virtual_reconfigure( const std::vector& new_sizes, @@ -473,17 +514,15 @@ class vTensor final { /* * Perform a virtual resize of the vTensor by modifying the size metadata that * gets used in compute shaders. This allows the shader to treat the - * underlying resource as if it were a different size. This function is a - * convenience overload of virtual_reconfigure; new strides will be computed - * based on the new sizes that preserves the memory layout of the tensor. + * underlying resource as if it were a different size. The new sizes cannot + * modify the dimensionality of the tensor. */ void virtual_resize(const std::vector& new_sizes); /* - * Discard the underlying VkImage or VkBuffer and re-allocate based on new - * tensor sizes + * Transpose the tensor in-place by updating its metadata. */ - void reallocate(const std::vector& new_sizes); + void virtual_transpose(const int64_t dim0, const int64_t dim1); /* * Check if this vTensor instance is a view of another vTensor instance diff --git a/backends/vulkan/runtime/gen_vulkan_spv.py b/backends/vulkan/runtime/gen_vulkan_spv.py index f4ba98b31fd..6ee29d45f18 100644 --- a/backends/vulkan/runtime/gen_vulkan_spv.py +++ b/backends/vulkan/runtime/gen_vulkan_spv.py @@ -38,6 +38,10 @@ # Basic configuration settings for shaders DEFAULT_ENV: Dict[str, Any] = { "PRECISION": "highp", + # B is shorthand for "binding". This is used to automatically increment the + # layout binding index when declaring layout bindings. Note that a container + # type is used because integers are immutable in Python. 
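The one-element list described above exists because the binding counter must stay mutable across calls to the template helpers. A rough C++ analogue of the same auto-increment pattern (purely illustrative; not the generator's code) passes the counter by reference so each declaration consumes the current binding index and bumps it.

#include <iostream>
#include <string>

// Each declaration uses the current binding index and increments the shared
// counter, so shader templates never hard-code binding numbers.
std::string declare_ubo(int& binding, const std::string& type, const std::string& name) {
  std::string decl = "layout(set = 0, binding = " + std::to_string(binding) +
      ") uniform restrict readonly " + name + "UBO { " + type + " " + name + "; };";
  ++binding;
  return decl;
}

int main() {
  int B = 0; // plays the role of DEFAULT_ENV["B"] = [0]
  std::cout << declare_ubo(B, "ivec4", "out_sizes") << "\n";    // binding = 0
  std::cout << declare_ubo(B, "ivec3", "out_limits") << "\n";   // binding = 1
  std::cout << declare_ubo(B, "ivec4", "out_axis_map") << "\n"; // binding = 2
  return 0;
}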
+ "B": [0], } # Establishes relationships between different tensor types and different GLSL types @@ -179,8 +183,14 @@ def get_access_qualifier(access_type: Optional[str]) -> str: raise AssertionError(f"Invalid access type: {access_type}") +def get_slot_val(slot: Union[int, List[int]]) -> int: + if isinstance(slot, list): + return slot[0] + return slot + + def layout_declare_buffer( - slot: int, + slot: Union[int, List[int]], access_type: str, var_name: str, dtype: str, @@ -192,15 +202,18 @@ def layout_declare_buffer( array_type = buffer_scalar_type(dtype) out_str = f""" -layout(set = 0, binding = {slot}) buffer {precision} restrict {get_access_qualifier(access_type)} {var_name}Buffer {{ +layout(set = 0, binding = {get_slot_val(slot)}) buffer {precision} restrict {get_access_qualifier(access_type)} {var_name}Buffer {{ {array_type} {var_name}[]; }}; """ + + if isinstance(slot, list): + slot[0] = slot[0] + 1 return out_str def layout_declare_image( - slot: int, + slot: Union[int, List[int]], access_type: str, var_name: str, dtype: str, @@ -209,11 +222,16 @@ def layout_declare_image( ) -> str: image_format = TYPE_MAPPINGS["IMAGE_FORMAT"][dtype] image_type = TYPE_MAPPINGS["IMAGE_T"][image_ndim][dtype] - return f"layout(set = 0, binding = {slot}, {image_format}) uniform {precision} restrict {get_access_qualifier(access_type)} {image_type} {var_name};" + + ret_str = f"layout(set = 0, binding = {get_slot_val(slot)}, {image_format}) uniform {precision} restrict {get_access_qualifier(access_type)} {image_type} {var_name};" + + if isinstance(slot, list): + slot[0] = slot[0] + 1 + return ret_str def layout_declare_sampler( - slot: int, + slot: Union[int, List[int]], access_type: str, var_name: str, dtype: str, @@ -222,11 +240,16 @@ def layout_declare_sampler( image_ndim: int = 3, ) -> str: sampler_type = TYPE_MAPPINGS["SAMPLER_T"][image_ndim][dtype] - return f"layout(set = 0, binding = {slot}) uniform {precision} {sampler_type} {var_name};" + + ret_str = f"layout(set = 0, binding = {get_slot_val(slot)}) uniform {precision} {sampler_type} {var_name};" + + if isinstance(slot, list): + slot[0] = slot[0] + 1 + return ret_str def layout_declare_tensor( - slot: int, + slot: Union[int, List[int]], access_type: str, var_name: str, dtype: str, @@ -262,7 +285,9 @@ def layout_declare_tensor( ) -def layout_declare_ubo(slot: int, *args, precision: str = "PRECISION") -> str: +def layout_declare_ubo( + slot: Union[int, List[int]], *args, precision: str = "PRECISION" +) -> str: assert len(args) % 2 == 0 var_list = list(zip(args[::2], args[1::2])) @@ -272,12 +297,14 @@ def layout_declare_ubo(slot: int, *args, precision: str = "PRECISION") -> str: ubo_name += var_name + "_" out_str = f""" -layout(set = 0, binding = {slot}) uniform {precision} restrict readonly {ubo_name}UBO {{ +layout(set = 0, binding = {get_slot_val(slot)}) uniform {precision} restrict readonly {ubo_name}UBO {{ """ for type_name, var_name in var_list: out_str += f"{type_name} {var_name};\n" out_str += "};" + if isinstance(slot, list): + slot[0] = slot[0] + 1 return out_str diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp index e014c52a3a4..64f24e3012d 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.cpp +++ b/backends/vulkan/runtime/graph/ComputeGraph.cpp @@ -38,14 +38,81 @@ namespace vkcompute { VALUE_PTR_CLASS_IMPL(vTensorPtr, api::vTensor, Tensor) VALUE_PTR_CLASS_IMPL(TensorRefPtr, TensorRef, TensorRef) -VALUE_PTR_CLASS_IMPL(StagingPtr, api::StorageBuffer, Staging) 
+VALUE_PTR_CLASS_IMPL(StagingPtr, api::StagingBuffer, Staging) VALUE_PTR_CLASS_IMPL(IntListPtr, std::vector, IntList) VALUE_PTR_CLASS_IMPL(DoubleListPtr, std::vector, DoubleList) VALUE_PTR_CLASS_IMPL(BoolListPtr, std::vector, BoolList) VALUE_PTR_CLASS_IMPL(ValueListPtr, std::vector, ValueList) +VALUE_PTR_CLASS_IMPL(SymIntPtr, SymInt, SymInt) #undef VALUE_PTR_CLASS_IMPL +// +// TmpTensor +// + +TmpTensor::TmpTensor( + ComputeGraph* const graph_ptr, + const std::vector& sizes, + const vkapi::ScalarType dtype, + const utils::StorageType storage_type, + const utils::GPUMemoryLayout memory_layout) + : graph_p(graph_ptr), + sobj_idx(get_sobj_idx()), + vref(graph_p->add_tensor( + sizes, + dtype, + storage_type, + memory_layout, + sobj_idx)) {} + +TmpTensor::TmpTensor( + ComputeGraph* const graph_ptr, + const std::vector& sizes, + const vkapi::ScalarType dtype, + const utils::StorageType storage_type) + : graph_p(graph_ptr), + sobj_idx(get_sobj_idx()), + vref(graph_p->add_tensor(sizes, dtype, storage_type, sobj_idx)) {} + +TmpTensor::TmpTensor( + ComputeGraph* const graph_ptr, + const std::vector& sizes, + const vkapi::ScalarType dtype, + const utils::GPUMemoryLayout memory_layout) + : graph_p(graph_ptr), + sobj_idx(get_sobj_idx()), + vref(graph_p->add_tensor(sizes, dtype, memory_layout, sobj_idx)) {} + +TmpTensor::TmpTensor( + ComputeGraph* const graph_ptr, + const std::vector& sizes, + const vkapi::ScalarType dtype) + : graph_p(graph_ptr), + sobj_idx(get_sobj_idx()), + vref(graph_p->add_tensor(sizes, dtype, sobj_idx)) {} + +TmpTensor::~TmpTensor() { + // Lifetime of this temporary tensor is expired; return the shared object to + // the pool, as long as the sobj index is valid + if (sobj_idx >= 0) { + graph_p->tmp_shared_object_idxs_.emplace(sobj_idx); + } +} + +int64_t TmpTensor::get_sobj_idx() { + int64_t sobj_idx; + // If no available temporary shared objects, request a new one to be created + if (graph_p->tmp_shared_object_idxs_.empty()) { + sobj_idx = graph_p->shared_objects_.size(); + } else { + // Get the first available shared object idx + sobj_idx = graph_p->tmp_shared_object_idxs_.top(); + graph_p->tmp_shared_object_idxs_.pop(); + } + return sobj_idx; +} + // // ComputeGraph // @@ -146,7 +213,7 @@ std::vector ComputeGraph::dim_order_of(const ValueRef idx) const { if (val.isTensor()) { return val.toConstTensor().dim_order(); } - VK_THROW("Could not get strides of value with type ", val.type()); + VK_THROW("Could not get dim order of value with type ", val.type()); } std::vector ComputeGraph::strides_of(const ValueRef idx) const { @@ -262,7 +329,7 @@ ValueRef ComputeGraph::add_staging( const size_t numel) { ValueRef idx(static_cast(values_.size())); check_no_active_value_ptrs(); - values_.emplace_back(api::StorageBuffer(context(), dtype, numel)); + values_.emplace_back(api::StagingBuffer(context(), dtype, numel)); return idx; } @@ -287,6 +354,13 @@ ValueRef ComputeGraph::add_string(std::string&& str) { return idx; } +ValueRef ComputeGraph::add_symint(const int32_t val) { + ValueRef idx(static_cast(values_.size())); + check_no_active_value_ptrs(); + values_.emplace_back(SymInt(context(), val)); + return idx; +} + ValueRef ComputeGraph::set_input_tensor( const ValueRef idx, const bool use_staging) { @@ -326,6 +400,22 @@ ValueRef ComputeGraph::set_output_tensor( return idx; } +vkapi::BufferBindInfo ComputeGraph::get_or_create_int_param_buffer( + const ValueRef idx) { + if (values_.at(idx).isInt()) { + const int32_t val = extract_scalar(idx); + create_params_buffer(val); + } else if 
(values_.at(idx).isSymInt()) { + SymIntPtr symint = get_symint(idx); + return vkapi::BufferBindInfo(symint->gpu_buffer.buffer()); + } + VK_THROW("Cannot create a int param buffer for the given value"); +} + +void ComputeGraph::set_symint(const ValueRef idx, const int32_t val) { + get_symint(idx)->set(val); +} + SharedObject& ComputeGraph::get_shared_object(const int64_t idx) { if (idx >= shared_objects_.size()) { shared_objects_.resize(static_cast(idx + 1)); @@ -364,7 +454,7 @@ utils::uvec3 ComputeGraph::create_global_wg_size(const ValueRef idx) { if (is_buffer_storage(idx)) { return {uint32_t(numel_of(idx)), 1u, 1u}; } - return image_extents_of(idx); + return logical_limits_of(idx); } utils::uvec3 ComputeGraph::create_local_wg_size( @@ -403,7 +493,7 @@ void ComputeGraph::copy_into_staging( const size_t numel) { StagingPtr staging = get_staging(idx); size_t nbytes = numel * vkapi::element_size(staging->dtype()); - copy_ptr_to_staging(data, *staging, nbytes); + staging->copy_from(data, nbytes); } void ComputeGraph::copy_from_staging( @@ -412,7 +502,7 @@ void ComputeGraph::copy_from_staging( const size_t numel) { StagingPtr staging = get_staging(idx); size_t nbytes = numel * vkapi::element_size(staging->dtype()); - copy_staging_to_ptr(*staging, data, nbytes); + staging->copy_to(data, nbytes); } void ComputeGraph::prepare() { diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h index b73b552067c..d61ff7e61f6 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.h +++ b/backends/vulkan/runtime/graph/ComputeGraph.h @@ -11,6 +11,7 @@ // @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName #include +#include #include @@ -58,14 +59,88 @@ class ComputeGraph; DECL_VALUE_PTR_CLASS(vTensorPtr, api::vTensor) DECL_VALUE_PTR_CLASS(TensorRefPtr, TensorRef) -DECL_VALUE_PTR_CLASS(StagingPtr, api::StorageBuffer) +DECL_VALUE_PTR_CLASS(StagingPtr, api::StagingBuffer) DECL_VALUE_PTR_CLASS(IntListPtr, std::vector) DECL_VALUE_PTR_CLASS(DoubleListPtr, std::vector) DECL_VALUE_PTR_CLASS(BoolListPtr, std::vector) DECL_VALUE_PTR_CLASS(ValueListPtr, std::vector) +DECL_VALUE_PTR_CLASS(SymIntPtr, SymInt); #undef DECL_VALUE_PTR_CLASS +// +// TmpTensor +// + +/* + * This struct is used to recycle the memory of temporary tensors that are + * created during the execution of a node. Upon construction, this struct will + * check the `tmp_shared_object_idxs_` of the provided `ComputeGraph` instance + * if any shared objects are available; if not, then a new one is created. A + * tensor value is then added to the `ComputeGraph` instance with the requested + * specifications. Upon destruction, the shared object index of the temporary + * tensor is returned to `tmp_shared_object_idxs_`. + * + * Note that instances of this struct can be used as if they were `ValueRef` due + * to implementation of a custom casting operator. + * + * This class should only be used to create tensors whose lifetimes exist only + * in a well defined scope (i.e. within a function). 
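A self-contained C++ model of the recycling behaviour described in this comment may help (no Vulkan objects; the names are invented): a scoped handle pulls a shared-object index from a pool on construction and pushes it back on destruction, so the next temporary in a later scope reuses the same backing slot.

#include <cstdint>
#include <iostream>
#include <stack>

// Hands out shared-object indices, preferring recycled ones.
struct SharedObjectPool {
  std::stack<int64_t> free_idxs;
  int64_t next_idx = 0;

  int64_t acquire() {
    if (free_idxs.empty()) {
      return next_idx++; // no recycled slot available; create a new one
    }
    int64_t idx = free_idxs.top();
    free_idxs.pop();
    return idx;
  }
};

// Scoped handle analogous to a temporary tensor bound to a shared object.
struct ScopedTmp {
  SharedObjectPool& pool;
  int64_t sobj_idx;

  explicit ScopedTmp(SharedObjectPool& p) : pool(p), sobj_idx(pool.acquire()) {}
  ~ScopedTmp() {
    pool.free_idxs.push(sobj_idx); // lifetime over; recycle the backing slot
  }
};

int main() {
  SharedObjectPool pool;
  {
    ScopedTmp a(pool); // gets index 0
    ScopedTmp b(pool); // gets index 1
    std::cout << a.sobj_idx << " " << b.sobj_idx << "\n";
  } // both indices are returned to the pool here
  ScopedTmp c(pool); // reuses a recycled index instead of allocating index 2
  std::cout << c.sobj_idx << "\n";
  return 0;
}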
+ */ +struct TmpTensor { + ComputeGraph* graph_p; + int64_t sobj_idx; + ValueRef vref; + + // + // Match all available overloads of `add_tensor` + // + + TmpTensor( + ComputeGraph* const graph_ptr, + const std::vector& sizes, + const vkapi::ScalarType dtype, + const utils::StorageType storage_type, + const utils::GPUMemoryLayout memory_layout); + + TmpTensor( + ComputeGraph* const graph_ptr, + const std::vector& sizes, + const vkapi::ScalarType dtype, + const utils::StorageType storage_type); + + TmpTensor( + ComputeGraph* const graph_ptr, + const std::vector& sizes, + const vkapi::ScalarType dtype, + const utils::GPUMemoryLayout memory_layout); + + TmpTensor( + ComputeGraph* const graph_ptr, + const std::vector& sizes, + const vkapi::ScalarType dtype); + + // No copy construction or assignment + TmpTensor(TmpTensor& other) = delete; + TmpTensor& operator=(TmpTensor& other) = delete; + + // No move construction or assignment + TmpTensor(TmpTensor&& other) = delete; + TmpTensor& operator=(TmpTensor&& other) = delete; + + // Custom cast to ValueRef + operator ValueRef() const { + return vref; + }; + + ~TmpTensor(); + + private: + // Helper function to get first available shared object index or request a new + // one to be created. + int64_t get_sobj_idx(); +}; + // // ComputeGraph // @@ -93,7 +168,12 @@ class ComputeGraph final { vkapi::DescriptorPoolConfig execute_descriptor_counts_; std::unique_ptr context_; + std::vector shared_objects_; + // This stack is used by `TmpTensor` instances to recycle shared objects + // for temporary tensors. See the comments of `TmpTensor` for more details + std::stack tmp_shared_object_idxs_; + std::vector values_; std::vector param_ubos_; @@ -154,6 +234,7 @@ class ComputeGraph final { GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS(DoubleListPtr, double_list, DoubleList) GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS(BoolListPtr, bool_list, BoolList) GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS(ValueListPtr, value_list, ValueList) + GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS(SymIntPtr, symint, SymInt); #undef GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS @@ -209,8 +290,8 @@ class ComputeGraph final { vkapi::ScalarType dtype_of(const ValueRef idx) const; - inline utils::uvec3 image_extents_of(const ValueRef idx) const { - return values_.at(idx).toConstTensor().image_extents(); + inline const utils::ivec3& logical_limits_of(const ValueRef idx) const { + return values_.at(idx).toConstTensor().logical_limits(); } inline int32_t numel_of(const ValueRef idx) const { @@ -232,12 +313,13 @@ class ComputeGraph final { .is_view_of(values_.at(base).toConstTensor()); } - inline utils::GPUMemoryLayout memory_layout_of(const ValueRef idx) const { - return values_.at(idx).toConstTensor().gpu_memory_layout(); + inline utils::GPUMemoryLayout estimate_memory_layout_of( + const ValueRef idx) const { + return values_.at(idx).toConstTensor().estimate_memory_layout(); } - inline int32_t packed_dim_whcn_idx_of(const ValueRef idx) const { - return values_.at(idx).toConstTensor().packed_dim_whcn_idx(); + inline int32_t packed_dim_of(const ValueRef idx) const { + return values_.at(idx).toConstTensor().packed_dim(); } inline vkapi::BufferBindInfo sizes_ubo(const ValueRef idx) { @@ -252,8 +334,12 @@ class ComputeGraph final { return values_.at(idx).toTensor().numel_ubo(); } - inline vkapi::BufferBindInfo texture_limits_ubo(const ValueRef idx) { - return values_.at(idx).toTensor().texture_limits_ubo(); + inline vkapi::BufferBindInfo axis_map_ubo(const ValueRef idx) { + return values_.at(idx).toTensor().axis_map_ubo(); + } + + inline 
vkapi::BufferBindInfo logical_limits_ubo(const ValueRef idx) { + return values_.at(idx).toTensor().logical_limits_ubo(); } // @@ -428,15 +514,28 @@ class ComputeGraph final { ValueRef add_string(std::string&& str); + ValueRef add_symint(const int32_t val); + ValueRef set_input_tensor(const ValueRef idx, const bool use_staging = true); ValueRef set_output_tensor(const ValueRef idx, const bool use_staging = true); template - const vkapi::BufferBindInfo create_params_buffer(const Block& data) { + vkapi::BufferBindInfo create_params_buffer(const Block& data) { param_ubos_.emplace_back(api::ParamsBuffer(context_.get(), data)); return vkapi::BufferBindInfo(param_ubos_.back().buffer()); } + /* + * Given a ValueRef, do the following depending on the type of the Value: + * - If it is a SymInt, return the BufferBindInfo of the ParamsBuffer object + * backing the SymInt. + * - If it is a regular Int, create a new ParamsBuffer using the integer value + * and return the BufferBindInfo of the created ParamsBuffer. + */ + vkapi::BufferBindInfo get_or_create_int_param_buffer(const ValueRef idx); + + void set_symint(const ValueRef idx, const int32_t val); + /* * Convenience function to add an input tensor along with its staging buffer */ @@ -583,6 +682,9 @@ class ComputeGraph final { friend class DoubleListPtr; friend class BoolListPtr; friend class ValueListPtr; + friend class SymIntPtr; + + friend struct TmpTensor; }; template diff --git a/backends/vulkan/runtime/graph/Logging.cpp b/backends/vulkan/runtime/graph/Logging.cpp index 2e4833bfc64..e05fa4e4876 100644 --- a/backends/vulkan/runtime/graph/Logging.cpp +++ b/backends/vulkan/runtime/graph/Logging.cpp @@ -71,8 +71,8 @@ void ComputeGraph::print_readable() { << std::setfill(' ') << std::endl; std::cout << std::setw(6) << "idx" << std::setw(10) << "type" << std::setw(20) - << "sizes" << std::setw(10) << "node_type" << std::setw(10) - << "so_idx" << std::endl; + << "sizes" << std::setw(10) << "node_type" << std::setw(15) + << "storage_bytes" << std::setw(10) << "so_idx" << std::endl; size_t value_idx = 0; for (Value& val : values_) { @@ -108,6 +108,16 @@ void ComputeGraph::print_readable() { } } + // Actual storage bytes used + std::cout << std::setw(15); + if (val.isTensor()) { + const api::vTensor& v_tensor = val.toTensor(); + auto memory_reqs = v_tensor.get_memory_requirements(); + std::cout << memory_reqs.size; + } else { + std::cout << ""; + } + std::cout << std::setw(10); if (value_ref_to_shared_object_idx.count(value_idx) > 0) { size_t shared_obj_idx = value_ref_to_shared_object_idx.at(value_idx); diff --git a/backends/vulkan/runtime/graph/containers/SharedObject.cpp b/backends/vulkan/runtime/graph/containers/SharedObject.cpp index 0d8b77a5b74..f2474da6673 100644 --- a/backends/vulkan/runtime/graph/containers/SharedObject.cpp +++ b/backends/vulkan/runtime/graph/containers/SharedObject.cpp @@ -15,10 +15,7 @@ namespace vkcompute { void SharedObject::add_user(ComputeGraph* const graph, const ValueRef idx) { vTensorPtr t = graph->get_tensor(idx); - // // Aggregate Memory Requirements - // - const VkMemoryRequirements mem_reqs = t->get_memory_requirements(); aggregate_memory_requirements.size = std::max(mem_reqs.size, aggregate_memory_requirements.size); @@ -26,27 +23,6 @@ void SharedObject::add_user(ComputeGraph* const graph, const ValueRef idx) { std::max(mem_reqs.alignment, aggregate_memory_requirements.alignment); aggregate_memory_requirements.memoryTypeBits |= mem_reqs.memoryTypeBits; - // - // Aggregate Allocation Create Info - // - - const 
VmaAllocationCreateInfo create_info = t->get_allocation_create_info(); - // Clear out CREATE_STRATEGY bit flags in case of conflict - VmaAllocationCreateFlags clear_mask = ~VMA_ALLOCATION_CREATE_STRATEGY_MASK; - VmaAllocationCreateFlags create_flags = create_info.flags & clear_mask; - // Use the default allocation strategy - aggregate_create_info.flags = - create_flags | vkapi::DEFAULT_ALLOCATION_STRATEGY; - - // Set the usage flag if it is currently not set - if (aggregate_create_info.usage == VMA_MEMORY_USAGE_UNKNOWN) { - aggregate_create_info.usage = create_info.usage; - } - // Otherwise check that there is no conflict regarding usage - VK_CHECK_COND(aggregate_create_info.usage == create_info.usage); - aggregate_create_info.requiredFlags |= create_info.requiredFlags; - aggregate_create_info.preferredFlags |= create_info.preferredFlags; - users.emplace_back(idx); } @@ -54,8 +30,12 @@ void SharedObject::allocate(ComputeGraph* const graph) { if (aggregate_memory_requirements.size == 0) { return; } + + VmaAllocationCreateInfo alloc_create_info = + graph->context()->adapter_ptr()->vma().gpuonly_resource_create_info(); + allocation = graph->context()->adapter_ptr()->vma().create_allocation( - aggregate_memory_requirements, aggregate_create_info); + aggregate_memory_requirements, alloc_create_info); } void SharedObject::bind_users(ComputeGraph* const graph) { diff --git a/backends/vulkan/runtime/graph/containers/SharedObject.h b/backends/vulkan/runtime/graph/containers/SharedObject.h index 37e80257f46..bd77f6f39ba 100644 --- a/backends/vulkan/runtime/graph/containers/SharedObject.h +++ b/backends/vulkan/runtime/graph/containers/SharedObject.h @@ -28,7 +28,6 @@ struct SharedObject { explicit SharedObject() = default; VkMemoryRequirements aggregate_memory_requirements; - VmaAllocationCreateInfo aggregate_create_info; std::vector users; vkapi::Allocation allocation; diff --git a/backends/vulkan/runtime/graph/containers/SymInt.cpp b/backends/vulkan/runtime/graph/containers/SymInt.cpp new file mode 100644 index 00000000000..c91db84b787 --- /dev/null +++ b/backends/vulkan/runtime/graph/containers/SymInt.cpp @@ -0,0 +1,24 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +namespace vkcompute { + +SymInt::SymInt(api::Context* context_p, const int32_t val) + : gpu_buffer(context_p, val){}; + +void SymInt::set(const int32_t val) { + gpu_buffer.update(val); +} + +void SymInt::operator=(const int32_t val) { + gpu_buffer.update(val); +} + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/containers/SymInt.h b/backends/vulkan/runtime/graph/containers/SymInt.h new file mode 100644 index 00000000000..0c9fbee5fe2 --- /dev/null +++ b/backends/vulkan/runtime/graph/containers/SymInt.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +namespace vkcompute { + +/* + * Represents a symbolic integer whose value can be variable. It is implemented + * as a thin wrapper around a `ParamsBuffer` object that holds the value of the + * integer. 
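The practical difference between a symbolic integer and a plain integer Value can be modelled on the CPU alone. In the sketch below (an analogy only, not the runtime API), a value captured by copy at "record" time stays frozen, while a value read through shared storage reflects later updates when the recorded work runs.

#include <functional>
#include <iostream>
#include <memory>

int main() {
  int plain_int = 3;
  auto shared_int = std::make_shared<int>(3); // stands in for the GPU-visible buffer

  // "Record" two commands, analogous to encoding compute shaders.
  std::function<void()> uses_plain = [plain_int]() {
    std::cout << "plain int sees " << plain_int << "\n";
  };
  std::function<void()> uses_shared = [shared_int]() {
    std::cout << "shared buffer sees " << *shared_int << "\n";
  };

  // Update both after recording, analogous to changing the value before execution.
  plain_int = 7;
  *shared_int = 7;

  uses_plain();  // still prints 3: the copy was baked in at record time
  uses_shared(); // prints 7: the update is visible through the shared storage
  return 0;
}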
The `ParamsBuffer` object allows the value of the symbolic integer + * to be changed from the CPU and have those changes be visible to all shaders + * that use the symbolic integer; it also allows the value of the symbolic + * integer to be the result of a compute shader. + * + * Regular scalar types represented by `TypeTag::INT` cannot be used for + * symbolic integers because their value is assumed to be constant; therefore + * the `Value` instance holding the value of the scalar does not contain + * any reference to the GPU buffers used to pass its value into compute shaders. + * Therefore, updating the value of the scalar does not impact the value seen + * by compute shaders. + */ +struct SymInt final { + api::ParamsBuffer gpu_buffer; + + explicit SymInt(api::Context* context_p, const int32_t val); + + void set(const int32_t val); + + void operator=(const int32_t val); +}; + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/containers/Types.cpp b/backends/vulkan/runtime/graph/containers/Types.cpp index c5ffc65add1..e7a8951a552 100644 --- a/backends/vulkan/runtime/graph/containers/Types.cpp +++ b/backends/vulkan/runtime/graph/containers/Types.cpp @@ -29,6 +29,7 @@ std::ostream& operator<<(std::ostream& out, const TypeTag& tag) { PRINT_CASE(BOOLLIST) PRINT_CASE(VALUELIST) PRINT_CASE(STRING) + PRINT_CASE(SYMINT) } return out; } diff --git a/backends/vulkan/runtime/graph/containers/Types.h b/backends/vulkan/runtime/graph/containers/Types.h index 79edbd50d3a..5840d1695ee 100644 --- a/backends/vulkan/runtime/graph/containers/Types.h +++ b/backends/vulkan/runtime/graph/containers/Types.h @@ -36,6 +36,7 @@ enum class TypeTag : uint32_t { // Special Type VALUELIST, STRING, + SYMINT, }; std::ostream& operator<<(std::ostream& out, const TypeTag& tag); diff --git a/backends/vulkan/runtime/graph/containers/Value.h b/backends/vulkan/runtime/graph/containers/Value.h index ba82213c6f8..8773f0c0b04 100644 --- a/backends/vulkan/runtime/graph/containers/Value.h +++ b/backends/vulkan/runtime/graph/containers/Value.h @@ -13,6 +13,7 @@ #include #include +#include #include namespace vkcompute { @@ -28,6 +29,11 @@ inline bool is_valid(ValueRef value_ref) { struct IOValueRef { ValueRef value; ValueRef staging; + + // Custom cast to ValueRef + operator ValueRef() const { + return value; + }; }; /* @@ -53,7 +59,7 @@ struct Value final { } u; api::vTensor as_tensor; - api::StorageBuffer as_staging; + api::StagingBuffer as_staging; TensorRef as_tensorref; std::vector as_int_list; @@ -67,6 +73,8 @@ struct Value final { std::string as_string; + SymInt as_symint; + Payload() : u() {} // NOLINTNEXTLINE ~Payload(){}; @@ -108,7 +116,7 @@ struct Value final { CASE_MOVE_MOVEABLE_TYPE( TypeTag::TENSOR, api::vTensor, as_tensor, vTensor); CASE_MOVE_MOVEABLE_TYPE( - TypeTag::STAGING, api::StorageBuffer, as_staging, StorageBuffer); + TypeTag::STAGING, api::StagingBuffer, as_staging, StagingBuffer); CASE_MOVE_MOVEABLE_TYPE( TypeTag::TENSORREF, TensorRef, as_tensorref, TensorRef); // Scalar lists @@ -123,6 +131,7 @@ struct Value final { TypeTag::VALUELIST, std::vector, as_value_list, vector); CASE_MOVE_MOVEABLE_TYPE( TypeTag::STRING, std::string, as_string, basic_string); + CASE_MOVE_MOVEABLE_TYPE(TypeTag::SYMINT, SymInt, as_symint, SymInt); case TypeTag::NONE: clearToNone(); @@ -152,7 +161,7 @@ struct Value final { payload.as_tensor.~vTensor(); break; case TypeTag::STAGING: - payload.as_staging.~StorageBuffer(); + payload.as_staging.~StagingBuffer(); break; case TypeTag::TENSORREF: 
payload.as_tensorref.~TensorRef(); @@ -172,6 +181,9 @@ struct Value final { case TypeTag::STRING: payload.as_string.~basic_string(); break; + case TypeTag::SYMINT: + payload.as_symint.~SymInt(); + break; // Manually list out the types so that if a type here is added later and // not handled the compiler can catch it. case TypeTag::NONE: @@ -247,7 +259,7 @@ struct Value final { as_tensor); SUPPORT_TRIVIALLY_MOVEABLE_TYPE( - api::StorageBuffer, + api::StagingBuffer, Staging, TypeTag::STAGING, as_staging); @@ -288,6 +300,8 @@ struct Value final { TypeTag::STRING, as_string); + SUPPORT_TRIVIALLY_MOVEABLE_TYPE(SymInt, SymInt, TypeTag::SYMINT, as_symint); + #undef SUPPORT_TRIVIALLY_COPYABLE_TYPE #undef SUPPORT_TRIVIALLY_MOVEABLE_TYPE diff --git a/backends/vulkan/runtime/graph/ops/PrepackNode.cpp b/backends/vulkan/runtime/graph/ops/PrepackNode.cpp index b77c62920dd..61b24cd409b 100644 --- a/backends/vulkan/runtime/graph/ops/PrepackNode.cpp +++ b/backends/vulkan/runtime/graph/ops/PrepackNode.cpp @@ -45,24 +45,23 @@ PrepackNode::PrepackNode( graph.update_descriptor_counts(noop_shader_, /*execute = */ false); } -api::StorageBuffer PrepackNode::create_staging_buffer(ComputeGraph* graph) { +api::StagingBuffer PrepackNode::create_staging_buffer(ComputeGraph* graph) { vTensorPtr packed = graph->get_tensor(packed_); // If no TensorRef is provided, create a staging buffer of zeros according to // the vkapi::vTensor metadata. if (graph->val_is_none(tref_)) { size_t numel = utils::multiply_integers(packed->sizes()); - api::StorageBuffer staging(graph->context(), packed->dtype(), numel); - size_t nbytes = numel * vkapi::element_size(packed->dtype()); - set_staging_zeros(staging, nbytes); + api::StagingBuffer staging(graph->context(), packed->dtype(), numel); + staging.set_staging_zeros(); return staging; } TensorRefPtr tref = graph->get_tref(tref_); size_t numel = utils::multiply_integers(tref->sizes); - api::StorageBuffer staging(graph->context(), tref->dtype, numel); + api::StagingBuffer staging(graph->context(), tref->dtype, numel); size_t nbytes = numel * vkapi::element_size(tref->dtype); - copy_ptr_to_staging(tref->data, staging, nbytes); + staging.copy_from(tref->data, nbytes); return staging; } @@ -70,7 +69,7 @@ void PrepackNode::encode(ComputeGraph* graph) { api::Context* const context = graph->context(); vTensorPtr packed = graph->get_tensor(packed_); - api::StorageBuffer staging = create_staging_buffer(graph); + api::StagingBuffer staging = create_staging_buffer(graph); std::unique_lock cmd_lock = context->dispatch_lock(); diff --git a/backends/vulkan/runtime/graph/ops/PrepackNode.h b/backends/vulkan/runtime/graph/ops/PrepackNode.h index c3ac8b963fd..3e713303c3d 100644 --- a/backends/vulkan/runtime/graph/ops/PrepackNode.h +++ b/backends/vulkan/runtime/graph/ops/PrepackNode.h @@ -56,7 +56,7 @@ class PrepackNode final { const vkapi::SpecVarList spec_vars_; private: - api::StorageBuffer create_staging_buffer(ComputeGraph* graph); + api::StagingBuffer create_staging_buffer(ComputeGraph* graph); }; } // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/glsl/activations.h b/backends/vulkan/runtime/graph/ops/glsl/activations.h index c5ee3b20855..94c9e1274de 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/activations.h +++ b/backends/vulkan/runtime/graph/ops/glsl/activations.h @@ -18,7 +18,7 @@ float hardswish(float x) { vec4 hardswish(vec4 tex) { return vec4( - hardswish(tex.x), hardswish(tex.y), hardswish(tex.z), hardswish(tex.z)); + hardswish(tex.x), hardswish(tex.y), 
hardswish(tex.z), hardswish(tex.w)); } float hardshrink(float x, float lambda, float neg_lambda) { @@ -30,3 +30,15 @@ vec4 hardshrink(vec4 tex, float lambda, float neg_lambda) { (vec4(greaterThan(tex, vec4(lambda))) + vec4(lessThan(tex, vec4(neg_lambda)))); } + +float hardsigmoid(float x) { + return mix(float(x >= 0.0), x / 6 + 0.5, float(abs(x) <= 3.0)); +} + +vec4 hardsigmoid(vec4 tex) { + return vec4( + hardsigmoid(tex.x), + hardsigmoid(tex.y), + hardsigmoid(tex.z), + hardsigmoid(tex.w)); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/addmm_naive.glsl b/backends/vulkan/runtime/graph/ops/glsl/addmm_naive.glsl deleted file mode 100644 index dbc87eb7944..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/addmm_naive.glsl +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -$if MAT2_IS_TRANSPOSED: - #define MAT2_IS_TRANSPOSED - -#include "indexing_utils.h" -#include "matmul.h" - -layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly image3D im_out; -layout(set = 0, binding = 1) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat1; -layout(set = 0, binding = 2) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat2; -layout(set = 0, binding = 3) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_self; - -layout(set = 0, binding = 4) uniform PRECISION restrict OutLimits { - ivec3 out_limits; -}; - -layout(set = 0, binding = 5) uniform PRECISION restrict InSizes { - ivec4 in_sizes; -}; - -layout(set = 0, binding = 6) uniform PRECISION restrict SelfSizes { - ivec3 self_sizes; -}; - -layout(set = 0, binding = 7) uniform PRECISION restrict AddmmParams { - float alpha; - float beta; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, out_limits))) { - return; - } - - vec4 texel = vec4(0); - - $if MAT1_PACKING == "W_packed": - $if MAT2_PACKING == "H_packed": - ivec3 mat2_pos = ivec3(pos.x * 4, 0, pos.z); - texel = matmul_naive_W_packed_H_packed( - im_mat1, - im_mat2, - pos, - in_sizes[0]); - $elif MAT2_PACKING == "W_packed": - texel = matmul_naive_W_packed_W_packed( - im_mat1, - im_mat2, - pos, - in_sizes[0]); - $else: - $raise Exception("Unsupported value for MAT2_PACKING") - $else: - $raise Exception("Unsupported value combo for MAT1_PACKING and MAT2_PACKING") - - vec4 self_texel = get_texel_W_packed( - im_self, - pos, - self_sizes.x == 1, - self_sizes.y == 1); - - texel = beta * self_texel + alpha * texel; - imageStore(im_out, pos, texel); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/addmm_naive_texture3d.glsl b/backends/vulkan/runtime/graph/ops/glsl/addmm_naive_texture3d.glsl new file mode 100644 index 00000000000..3d9bf885df6 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/addmm_naive_texture3d.glsl @@ -0,0 +1,174 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#version 450 core + +#define PRECISION ${PRECISION} + +$if MAT2_IS_TRANSPOSED: + #define MAT2_IS_TRANSPOSED + +$if HAS_BIAS: + #define HAS_BIAS + +#include "indexing_utils.h" + +${layout_declare_tensor(B, "w", "out_tensor", DTYPE, "texture3d")} +${layout_declare_tensor(B, "r", "mat1_tensor", DTYPE, "texture3d")} +${layout_declare_tensor(B, "r", "mat2_tensor", DTYPE, "texture3d")} +$if HAS_BIAS: + ${layout_declare_tensor(B, "r", "bias_tensor", DTYPE, "texture3d")} +${layout_declare_ubo(B, "ivec4", "out_sizes")} +${layout_declare_ubo(B, "ivec3", "out_limits")} +${layout_declare_ubo(B, "ivec4", "out_axis_map")} +${layout_declare_ubo(B, "ivec4", "mat1_sizes")} +${layout_declare_ubo(B, "ivec4", "mat1_axis_map")} +${layout_declare_ubo(B, "ivec4", "mat2_sizes")} +${layout_declare_ubo(B, "ivec4", "mat2_axis_map")} +$if HAS_BIAS: + ${layout_declare_ubo(B, "ivec4", "bias_sizes")} + ${layout_declare_ubo(B, "ivec4", "bias_axis_map")} + ${layout_declare_ubo(B, "float", "alpha", "float", "beta")} + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +layout(constant_id = 3) const int out_packed_dim = C_DIM; +layout(constant_id = 4) const int mat1_packed_dim = W_DIM; +layout(constant_id = 5) const int mat2_packed_dim = H_DIM; +layout(constant_id = 6) const int bias_packed_dim = W_DIM; + +#ifdef HAS_BIAS +vec4 get_bias_texel_W_packed(ivec3 logical_pos) { + ivec3 bias_pos = ivec3(0); + if (bias_sizes.y == 1) { + bias_pos[bias_axis_map.y] = 0; + } else { + bias_pos[bias_axis_map.y] = logical_pos.y; + } + if (bias_sizes.x == 1) { + bias_pos[bias_axis_map.x] = 0; + vec4 bias_texel = texelFetch(bias_tensor, bias_pos, 0); + // Only the first value is valid, the rest is 0 padding + return vec4(bias_texel.x); + } else { + bias_pos[bias_axis_map.x] = logical_pos.x; + } + + return texelFetch(bias_tensor, bias_pos, 0); +} +#endif // HAS_BIAS + +vec4 matmul_naive_k_dim_packed(const ivec3 out_lpos) { + ivec3 mat1_pos; + mat1_pos[mat1_axis_map.x] = 0; + mat1_pos[mat1_axis_map.y] = out_lpos.y; + mat1_pos[mat1_axis_map.z] = out_lpos.z; +#ifdef MAT2_IS_TRANSPOSED + const int mat2_k_axis = mat2_axis_map.x; + const int mat2_row_axis = mat2_axis_map.y; +#else + const int mat2_k_axis = mat2_axis_map.y; + const int mat2_row_axis = mat2_axis_map.x; +#endif // MAT2_IS_TRANSPOSED + + vec4 texel = vec4(0); + const int K = divup4(mat1_sizes.x); + + for (int i = 0; i < K; ++i) { + const vec4 mat1_tex = texelFetch(mat1_tensor, mat1_pos, 0); + + vec4 sums; + for (int r = 0; r < 4; ++r) { + // On-demand construction of mat2_pos appears to provide the lowest + // latency. Surprisingly, this doesn't translate to mat1_pos. 
+ ivec3 mat2_pos = ivec3(0); + mat2_pos[mat2_k_axis] = i; + mat2_pos[mat2_row_axis] = out_lpos.x * 4 + r; +#ifndef MAT2_IS_TRANSPOSED + mat2_pos[mat2_axis_map.z] = out_lpos.z; +#endif // MAT2_IS_TRANSPOSED + sums[r] = dot(mat1_tex, texelFetch(mat2_tensor, mat2_pos, 0)); + } + + texel += sums; + + mat1_pos[mat1_axis_map.x]++; + } + + return texel; +} + +vec4 matmul_naive_k_dim_packed_row_dim_packed(const ivec3 out_lpos) { + ivec3 mat1_pos; + mat1_pos[mat1_axis_map.x] = 0; + mat1_pos[mat1_axis_map.y] = out_lpos.y; + mat1_pos[mat1_axis_map.z] = out_lpos.z; + + ivec3 mat2_pos; + mat2_pos[mat2_axis_map.x] = out_lpos.x; + mat2_pos[mat2_axis_map.y] = 0; + mat2_pos[mat2_axis_map.z] = out_lpos.z; + + ivec3 mat2_pos_offset = ivec3(0); + mat2_pos_offset[mat2_axis_map.y] = 1; + + const int mat2_y_axis = mat2_axis_map.y; + + vec4 texel = vec4(0); + const int K = divup4(mat1_sizes.x); + + for (int i = 0; + i < K; + ++i, mat1_pos[mat1_axis_map.x]++, mat2_pos[mat2_axis_map.y]+=4) { + const vec4 mat1_tex = texelFetch(mat1_tensor, mat1_pos, 0); + + for (int r = 0; r < 4; ++r) { + // On-demand construction of mat2_pos appears to provide the lowest + // latency. Surprisingly, this doesn't translate to mat1_pos. + ivec3 mat2_pos = ivec3(0); + mat2_pos[mat2_axis_map.x] = out_lpos.x; + mat2_pos[mat2_axis_map.y] = 4 * i + r; + mat2_pos[mat2_axis_map.z] = out_lpos.z; + + vec4 mat1_comp_vec = vec4(mat1_tex[r]); + texel = fma(mat1_comp_vec, texelFetch(mat2_tensor, mat2_pos, 0), texel); + } + } + + return texel; +} + +void main() { + const ivec3 out_lpos = ivec3(gl_GlobalInvocationID); + if (any(greaterThanEqual(out_lpos, out_limits))) { + return; + } + + vec4 texel = vec4(0); + +#ifdef MAT2_IS_TRANSPOSED + if (mat2_packed_dim == W_DIM) { + texel = matmul_naive_k_dim_packed(out_lpos); + } else { + texel = matmul_naive_k_dim_packed_row_dim_packed(out_lpos); + } +#else + if (mat2_packed_dim == W_DIM) { + texel = matmul_naive_k_dim_packed_row_dim_packed(out_lpos); + } else { + texel = matmul_naive_k_dim_packed(out_lpos); + } +#endif // MAT2_IS_TRANSPOSED + +#ifdef HAS_BIAS + vec4 bias_texel = get_bias_texel_W_packed(out_lpos); + texel = beta * bias_texel + alpha * texel; +#endif // HAS_BIAS + + write_texel_lpos(out_tensor, out_lpos, texel, out_axis_map); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/addmm_naive.yaml b/backends/vulkan/runtime/graph/ops/glsl/addmm_naive_texture3d.yaml similarity index 61% rename from backends/vulkan/runtime/graph/ops/glsl/addmm_naive.yaml rename to backends/vulkan/runtime/graph/ops/glsl/addmm_naive_texture3d.yaml index 48db85cb56e..33b617eed13 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/addmm_naive.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/addmm_naive_texture3d.yaml @@ -4,21 +4,21 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
-addmm_naive: +addmm_naive_texture3d: parameter_names_with_default_values: DTYPE: float - NDIM: 3 - MAT1_PACKING: W_packed - MAT2_PACKING: H_packed MAT2_IS_TRANSPOSED: false + HAS_BIAS: true generate_variant_forall: DTYPE: - VALUE: float - VALUE: half shader_variants: - - NAME: addmm_naive_W_packed_H_packed - - NAME: addmm_naive_W_packed_W_packed - MAT2_PACKING: W_packed - - NAME: linear_naive_W_packed_W_packed - MAT2_PACKING: W_packed + - NAME: addmm_naive_texture3d + - NAME: matmul_naive_texture3d + HAS_BIAS: false + - NAME: linear_naive_texture3d MAT2_IS_TRANSPOSED: true + - NAME: matmul_transposed_naive_texture3d + MAT2_IS_TRANSPOSED: true + HAS_BIAS: false diff --git a/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.glsl b/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.glsl index 1698efb0b15..ad794d6db49 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.glsl @@ -16,90 +16,219 @@ $if MAT2_IS_TRANSPOSED: $if BATCH_MODE: #define BATCH_MODE -$if TILE_ROW == "tile_row_2": - #define TILE_ROW_2 +$if HAS_BIAS: + #define HAS_BIAS #include "indexing_utils.h" -#include "matmul.h" -// addmm will have additional arguments compared to regular mm -layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly image3D im_out; -layout(set = 0, binding = 1) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat1; -layout(set = 0, binding = 2) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat2; -layout(set = 0, binding = 3) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_self; +${layout_declare_tensor(B, "w", "out_tensor", DTYPE, "texture3d")} +${layout_declare_tensor(B, "r", "mat1_tensor", DTYPE, "texture3d")} +${layout_declare_tensor(B, "r", "mat2_tensor", DTYPE, "texture3d")} +$if HAS_BIAS: + ${layout_declare_tensor(B, "r", "bias_tensor", DTYPE, "texture3d")} +${layout_declare_ubo(B, "ivec4", "out_sizes")} +${layout_declare_ubo(B, "ivec4", "out_axis_map")} +${layout_declare_ubo(B, "ivec4", "mat1_sizes")} +${layout_declare_ubo(B, "ivec4", "mat1_axis_map")} +${layout_declare_ubo(B, "ivec4", "mat2_sizes")} +${layout_declare_ubo(B, "ivec4", "mat2_axis_map")} +$if HAS_BIAS: + ${layout_declare_ubo(B, "ivec4", "bias_sizes")} + ${layout_declare_ubo(B, "ivec4", "bias_axis_map")} + ${layout_declare_ubo(B, "float", "alpha", "float", "beta")} -layout(set = 0, binding = 4) uniform PRECISION restrict OutLimits { - ivec3 out_limits; -}; +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; -layout(set = 0, binding = 5) uniform PRECISION restrict OutSizes { - ivec4 out_sizes; -}; +layout(constant_id = 3) const int out_packed_dim = C_DIM; -layout(set = 0, binding = 6) uniform PRECISION restrict SelfSizes { - ivec4 self_sizes; -}; +// To convince the SPIR-V compiler to unroll the loops optimally, need this +// macro +#define FOUR 4 -layout(set = 0, binding = 7) uniform PRECISION restrict InLimits { - ivec3 in_limits; +#define TILE_ROWS ${TILE_ROWS} + +// we avoid mat4 and vec4 usage here as they compile to much less efficient +// SPIR-V +struct FloatMatrix_2d { + float data[TILE_ROWS][FOUR]; }; -layout(set = 0, binding = 8) uniform PRECISION restrict Params { - float alpha; - float beta; +struct FloatMatrix_3d { + float data[TILE_ROWS][FOUR][FOUR]; }; -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; +#ifdef BATCH_MODE + #define FloatMatrix FloatMatrix_3d +#else + #define FloatMatrix FloatMatrix_2d +#endif // BATCH_MODE + +#ifdef HAS_BIAS +// get texel 
from self tensor (channel_packed) in addmm +vec4 get_texel_C_packed(const ivec2 idx) { + ivec3 bias_pos = ivec3(0); + if (bias_sizes.x > 1) { + bias_pos[bias_axis_map.x] = idx.x; + } + if (bias_sizes.y > 1) { + bias_pos[bias_axis_map.y] = idx.y; + } -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); + return texelFetch(bias_tensor, bias_pos, 0); +} +#endif // HAS_BIAS + +FloatMatrix matmul_partial(const ivec4 out_idx_tl) { + FloatMatrix results; + for (int i = 0; i < TILE_ROWS; i++) { + for (int j = 0; j < FOUR; j++) { +#ifdef BATCH_MODE + for (int k = 0; k < FOUR; k++) { + results.data[i][j][k] = 0.0f; + } +#else + results.data[i][j] = 0.0f; +#endif // BATCH_MODE + } + } + vec4 mat1_tensor_partial_load[TILE_ROWS]; + vec4 mat2_tensor_partial_load[FOUR]; + +#ifdef MAT2_IS_TRANSPOSED + const int mat2_k_axis = mat2_axis_map.x; + const int mat2_row_axis = mat2_axis_map.y; +#else + const int mat2_k_axis = mat2_axis_map.y; + const int mat2_row_axis = mat2_axis_map.x; +#endif // MAT2_IS_TRANSPOSED + +#ifdef BATCH_MODE + for (int batch_idx = 0; batch_idx < FOUR; batch_idx++) { + if (out_idx_tl.z + batch_idx >= out_sizes.z) { + break; + } +#endif // BATCH_MODE + for (int k = 0; k < mat1_sizes.x; k+=4) { + const int k_div4 = k >> 2; + // read and cache (4 x TILE_ROWS) tile of mat1 + for (int r = 0; r < TILE_ROWS; r++) { + ivec3 mat1_pos = ivec3(0); + mat1_pos[mat1_axis_map.x] = k_div4; + mat1_pos[mat1_axis_map.y] = out_idx_tl.y + r; +#ifdef BATCH_MODE + mat1_pos[mat1_axis_map.z] = out_idx_tl.z + batch_idx; +#endif // BATCH_MODE + + mat1_tensor_partial_load[r] = texelFetch(mat1_tensor, mat1_pos, 0); + } - if (any(greaterThanEqual(pos, out_limits))) { - return; + // read and cache (4 x 4) tile of mat2 + for (int r = 0; r < FOUR; ++r) { + ivec3 mat2_pos = ivec3(0); + mat2_pos[mat2_k_axis] = k_div4; + mat2_pos[mat2_row_axis] = out_idx_tl.x + r; +#if defined(BATCH_MODE) && !defined(MAT2_IS_TRANSPOSED) + mat2_pos[mat2_axis_map.z] = out_idx_tl.z + batch_idx; +#endif // BATCH_MODE + + mat2_tensor_partial_load[r] = texelFetch(mat2_tensor, mat2_pos, 0); + } + + // perform partial dot products and add partial result to results + for (int out_row = 0; out_row < TILE_ROWS; out_row++) { + for (int out_col = 0; out_col < FOUR; out_col++) { +#ifdef BATCH_MODE + results.data[out_row][out_col][batch_idx] += +#else + results.data[out_row][out_col] += +#endif // BATCH_MODE + dot(mat1_tensor_partial_load[out_row], mat2_tensor_partial_load[out_col]); + } + } } +#ifdef BATCH_MODE + } +#endif // BATCH_MODE + + return results; +} - $if BATCH_MODE: - FloatMatrix_3d results = matmul_partial_3d( - im_mat1, - im_mat2, - pos, - out_sizes[2], - in_limits[0]); - $else: - FloatMatrix_2d results = matmul_partial_2d( - im_mat1, - im_mat2, - pos, - out_sizes[2], - in_limits[0]); - - for (int idx_c = 0; idx_c < TILE_ROWS; idx_c++) { - for (int idx_r = 0; idx_r < FOUR; idx_r++) { - const ivec3 out_pos = - ivec3(idx_r + FOUR * pos.x, idx_c + TILE_ROWS * pos.y, pos.z); - - vec4 self_texel = get_texel_C_packed( - im_self, - out_pos, - self_sizes.x == 1, - self_sizes.y == 1); - - // results is in transposed order w.r.t. 
the desired output - $if BATCH_MODE: - imageStore( - im_out, - out_pos, - vec4( - beta * self_texel.x + alpha * results.data[idx_c][idx_r][0], - beta * self_texel.x + alpha * results.data[idx_c][idx_r][1], - beta * self_texel.x + alpha * results.data[idx_c][idx_r][2], - beta * self_texel.x + alpha * results.data[idx_c][idx_r][3])); - $else: - imageStore( - im_out, - out_pos, - vec4( - beta * self_texel.x + alpha * results.data[idx_c][idx_r], 0.0, 0.0, 0.0)); +// +// Write result matrix to output (3D matmul) +// + +void write_results_C_packed(const ivec4 out_idx_tl, FloatMatrix results) { + ivec3 out_pos = tidx_to_pos( + out_idx_tl, out_sizes, out_axis_map, out_packed_dim); + + for (int tile_c = 0; + tile_c < TILE_ROWS; + tile_c++, out_pos[out_axis_map.y]++) { + out_pos[out_axis_map.x] = out_idx_tl.x; + + for (int tile_r = 0; + tile_r < FOUR; + tile_r++, out_pos[out_axis_map.x]++) { + +#ifdef HAS_BIAS + ivec2 bias_idx; + bias_idx[bias_axis_map.x] = out_pos[out_axis_map.x]; + bias_idx[bias_axis_map.y] = out_pos[out_axis_map.y]; + float bias_val = get_texel_C_packed(bias_idx).x; +#ifdef BATCH_MODE + vec4 bias_texel = vec4(bias_val); +#else + vec4 bias_texel = vec4(bias_val, 0, 0, 0); +#endif // BATCH_MODE +#endif // HAS_BIAS + +#ifdef BATCH_MODE + vec4 out_texel = vec4( + results.data[tile_c][tile_r][0], + results.data[tile_c][tile_r][1], + results.data[tile_c][tile_r][2], + results.data[tile_c][tile_r][3]); +#else + vec4 out_texel = vec4( + results.data[tile_c][tile_r], + 0.0, + 0.0, + 0.0); +#endif // BATCH_MODE + +#ifdef HAS_BIAS + imageStore(out_tensor, out_pos, beta * bias_texel + alpha * out_texel); +#else + imageStore(out_tensor, out_pos, out_texel); +#endif // HAS_BIAS } } } + +void main() { + // Each thread is responsible for calculating a (4 x TILE_ROWS x 1) tile of + // output elements. If the input matrices are 3D, then a (4 x TILE_ROWS x 4) + // tile of output elements will be computed. Note the sizes are written in + // (W x H x C) format. 
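For illustration, a minimal Python sketch of the tile arithmetic in the code that follows; the helper name tile_topleft is ours, and tile_rows=4 mirrors the default TILE_ROWS variant from the yaml:

def tile_topleft(tile_idx, tile_rows=4, batch_mode=False):
    # tile_idx is the thread's (x, y, z) global invocation id
    x, y, z = tile_idx
    return (x * 4,                        # width: each tile spans 4 output columns
            y * tile_rows,                # height: each tile spans TILE_ROWS output rows
            z * 4 if batch_mode else z,   # batch mode covers 4 batch entries per tile
            0)

# e.g. thread (2, 1, 0) owns output columns 8..11 and rows 4..7
assert tile_topleft((2, 1, 0)) == (8, 4, 0, 0)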
+ const ivec3 tile_idx = ivec3(gl_GlobalInvocationID); + + // Calculate the tensor index of the top left element in the output tile + const ivec4 out_idx_topleft = ivec4( + tile_idx.x * 4, + tile_idx.y * TILE_ROWS, +#ifdef BATCH_MODE + tile_idx.z * 4, +#else + tile_idx.z, +#endif // BATCH_MODE + 0); + + // If the top left element is already out of range, then skip + if (any(greaterThanEqual(out_idx_topleft, out_sizes))) { + return; + } + + FloatMatrix results = matmul_partial(out_idx_topleft); + + write_results_C_packed(out_idx_topleft, results); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.yaml b/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.yaml index b958d3b9543..c82c2003d20 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.yaml @@ -7,24 +7,37 @@ addmm_optimized: parameter_names_with_default_values: DTYPE: float - NDIM: 3 - PACKING: C_packed MAT2_IS_TRANSPOSED: false BATCH_MODE: false - TILE_ROW: tile_row_4 + TILE_ROWS: 4 + HAS_BIAS: true generate_variant_forall: - TILE_ROW: - - VALUE: tile_row_4 - - VALUE: tile_row_2 + TILE_ROWS: + - VALUE: 4 + SUFFIX: tile_row_4 + - VALUE: 2 + SUFFIX: tile_row_2 DTYPE: - VALUE: float - VALUE: half shader_variants: - NAME: addmm_optimized + - NAME: matmul_optimized + HAS_BIAS: false - NAME: linear_optimized MAT2_IS_TRANSPOSED: true + - NAME: matmul_transposed_optimized + MAT2_IS_TRANSPOSED: true + HAS_BIAS: false - NAME: batch_addmm_optimized BATCH_MODE: true + - NAME: batch_matmul_optimized + BATCH_MODE: true + HAS_BIAS: false - NAME: batch_linear_optimized MAT2_IS_TRANSPOSED: true BATCH_MODE: true + - NAME: batch_matmul_transposed_optimized + MAT2_IS_TRANSPOSED: true + BATCH_MODE: true + HAS_BIAS: false diff --git a/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl b/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl index ec7e1da296c..3103c92aea1 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl @@ -19,38 +19,43 @@ layout(std430) buffer; -${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(1, "r", "t_in", DTYPE, STORAGE)} -${layout_declare_tensor(2, "r", "t_other", DTYPE, STORAGE)} -${layout_declare_ubo(3, "ivec4", "out_sizes")} -${layout_declare_ubo(4, "ivec4", "in_sizes")} -${layout_declare_ubo(5, "ivec4", "other_sizes")} -${layout_declare_ubo(6, "ivec2", "broadcast_params")} -${layout_declare_ubo(7, "float", "alpha")} +${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} +${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} +${layout_declare_tensor(B, "r", "t_other", DTYPE, STORAGE)} +${layout_declare_ubo(B, "ivec4", "out_sizes")} +${layout_declare_ubo(B, "ivec4", "out_axis_map")} +${layout_declare_ubo(B, "ivec4", "in_sizes")} +${layout_declare_ubo(B, "ivec4", "in_axis_map")} +${layout_declare_ubo(B, "ivec4", "other_sizes")} +${layout_declare_ubo(B, "ivec4", "other_axis_map")} +${layout_declare_ubo(B, "ivec2", "broadcast_params")} +${layout_declare_ubo(B, "float", "alpha")} layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; layout(constant_id = 3) const int packed_dim = C_DIM; void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - const ivec4 idx = to_tensor_idx(pos, out_sizes, packed_dim); + const ivec3 lpos = ivec3(gl_GlobalInvocationID); + const ivec4 tidx = lpos_to_tidx(lpos, out_sizes, out_axis_map.w, packed_dim); - if (any(greaterThanEqual(idx, out_sizes))) { + if 
(any(greaterThanEqual(tidx, out_sizes))) { return; } - ivec4 in_idx = broadcast_indices(idx, in_sizes); - VEC4_T in_texel = VEC4_T(texelFetch( + // broadcast on logical sizes + ivec4 in_idx = broadcast_indices(tidx, in_sizes); + VEC4_T in_texel = VEC4_T(load_texel( t_in, - to_texture_pos(in_idx, in_sizes, packed_dim), - 0)); + // read axis mapped texel + tidx_to_pos(in_idx, in_sizes, in_axis_map, packed_dim))); - ivec4 other_idx = broadcast_indices(idx, other_sizes); - VEC4_T other_texel = VEC4_T(texelFetch( + // broadcast on logical sizes + ivec4 other_idx = broadcast_indices(tidx, other_sizes); + VEC4_T other_texel = VEC4_T(load_texel( t_other, - to_texture_pos(other_idx, other_sizes, packed_dim), - 0)); + // read axis mapped texel + tidx_to_pos(other_idx, other_sizes, other_axis_map, packed_dim))); // Check boolean broadcast flags; we use ivec2 instead of bvec2 for alignment. if (broadcast_params.x > 0) { @@ -60,5 +65,9 @@ void main() { other_texel = other_texel.xxxx; } - imageStore(t_out, pos, VEC4_T(op(in_texel, other_texel, alpha))); + write_texel_lpos( + t_out, + lpos, + VEC4_T(op(in_texel, other_texel, alpha)), + out_axis_map); } diff --git a/backends/vulkan/runtime/graph/ops/glsl/buffer_to_nchw.glsl b/backends/vulkan/runtime/graph/ops/glsl/buffer_to_nchw.glsl index 58796879e85..201b4d17262 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/buffer_to_nchw.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/buffer_to_nchw.glsl @@ -23,13 +23,13 @@ layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; layout(constant_id = 3) const int UNUSED_packed_dim = W_DIM; void main() { - int out_id = int(gl_GlobalInvocationID.x); - if (out_id >= numel) { + int nchwi = int(gl_GlobalInvocationID.x); + if (nchwi >= numel) { return; } - ivec4 t_in_idx = from_nchw_buffer_i(out_id, in_sizes); - const int in_id = to_buffer_id(t_in_idx, in_strides); + ivec4 in_tidx = nchwi_to_tidx(nchwi, in_sizes); + const int in_bufi = tidx_to_bufi(in_tidx, in_strides); - nchw_buf[out_id] = t_in[in_id]; + nchw_buf[nchwi] = t_in[in_bufi]; } diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv1d.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv1d.glsl index 35f8e25fc25..fe6d7ba7a96 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv1d.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/conv1d.glsl @@ -18,32 +18,22 @@ layout(std430) buffer; -layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out; -layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in; -layout(set = 0, binding = 2) uniform PRECISION sampler3D kernel_in; -layout(set = 0, binding = 3) uniform PRECISION sampler3D bias_in; - -layout(set = 0, binding = 4) uniform PRECISION restrict OutLimits { - ivec3 out_limits; -}; - -layout(set = 0, binding = 5) uniform PRECISION restrict InSizes { - ivec4 in_sizes; -}; - -layout(set = 0, binding = 6) uniform PRECISION restrict Params { - int kernel_size; - int stride; - int padding; - int dilation; - int in_group_size; - int out_group_size; -}; - -layout(set = 0, binding = 7) uniform PRECISION restrict OutputParams { - float out_min; - float out_max; -}; +${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} +${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} +${layout_declare_tensor(B, "r", "kernel_in", DTYPE, STORAGE)} +${layout_declare_tensor(B, "r", "bias_in", DTYPE, STORAGE)} + +${layout_declare_ubo(B, "ivec3", "out_limits")} +${layout_declare_ubo(B, "ivec4", "in_sizes")} + +${layout_declare_ubo(B, 
"ivec4", "out_axis_map")} +${layout_declare_ubo(B, "ivec4", "in_axis_map")} +${layout_declare_ubo(B, "ivec4", "kernel_axis_map")} +${layout_declare_ubo(B, "ivec4", "bias_axis_map")} + +${layout_declare_ubo(B,"int", "kernel_size", "int", "stride", "int", "padding", "int", "dilation", "int", "in_group_size", "int", "out_group_size")} + +${layout_declare_ubo(B, "float", "out_min", "float", "out_max")} layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; @@ -67,9 +57,9 @@ layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; // shader invocations, where each invocation computes 1 result. But that // performs worse. void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); + const ivec3 lpos = ivec3(gl_GlobalInvocationID); - if (any(greaterThanEqual(pos, out_limits))) { + if (any(greaterThanEqual(lpos, out_limits))) { return; } @@ -78,8 +68,8 @@ void main() { // "out_c" is the output's channel index where we write our result. // Across shader invocations, this is the only value that varies. - int out_c = pos.y; - vec4 bias = texelFetch(bias_in, ivec3(out_c, 0, 0), 0); + int out_c = lpos.y; + VEC4_T bias = load_texel_lpos(bias_in, ivec3(out_c, 0, 0), bias_axis_map); // "in_c" tracks the input's channel start index. // We iterate over the input group that corresponds to the output group. @@ -98,7 +88,7 @@ void main() { int out_l = 0; for (int in_l = l_start; in_l < l_end; in_l += stride, ++out_l) { - vec4 sum = vec4(0); + VEC4_T sum = VEC4_T(0); for (int in_c = c_start; in_c < c_end; ++in_c) { // "k" tracks the kernel's index for our input-kernel computation. @@ -107,25 +97,25 @@ void main() { for (int k = 0; k < kernel_size; k += 4) { // Since the weight tensor is width-packed, which is along the length // dimension, we can batch-read four elements at a time. 
- const ivec3 w_pos = ivec3(k / 4, in_c % in_group_size, out_c); - const vec4 weight = texelFetch(kernel_in, w_pos, 0); + const ivec3 w_lpos = ivec3(k / 4, in_c % in_group_size, out_c); + const VEC4_T weight = load_texel_lpos(kernel_in, w_lpos, kernel_axis_map); - const ivec3 in_pos_0 = ivec3(in_l + k * dilation, in_c, n / 4); - sum = fma(weight.xxxx, texelFetch(image_in, in_pos_0, 0), sum); + ivec3 in_pos = lpos_to_pos(ivec3(in_l + k * dilation, in_c, n / 4), in_axis_map); + sum = fma(weight.xxxx, load_texel(t_in, in_pos), sum); - const ivec3 in_pos_1 = ivec3(in_l + (k+1) * dilation, in_c, n / 4); - sum = fma(weight.yyyy, texelFetch(image_in, in_pos_1, 0), sum); + in_pos[in_axis_map.x] += dilation; + sum = fma(weight.yyyy, load_texel(t_in, in_pos), sum); - const ivec3 in_pos_2 = ivec3(in_l + (k+2) * dilation, in_c, n / 4); - sum = fma(weight.zzzz, texelFetch(image_in, in_pos_2, 0), sum); + in_pos[in_axis_map.x] += dilation; + sum = fma(weight.zzzz, load_texel(t_in, in_pos), sum); - const ivec3 in_pos_3 = ivec3(in_l + (k+3) * dilation, in_c, n / 4); - sum = fma(weight.wwww, texelFetch(image_in, in_pos_3, 0), sum); + in_pos[in_axis_map.x] += dilation; + sum = fma(weight.wwww, load_texel(t_in, in_pos), sum); } } - ivec3 out_pos = ivec3(out_l, out_c, n / 4); - imageStore(image_out, out_pos, op(sum + bias.x, out_min, out_max)); + const ivec3 out_lpos = ivec3(out_l, out_c, n / 4); + write_texel_lpos(t_out, out_lpos, op(sum + bias.x, out_min, out_max), out_axis_map); } } } diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv1d.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv1d.yaml index 16c4172510c..2266649d2b9 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv1d.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/conv1d.yaml @@ -7,9 +7,8 @@ conv1d: parameter_names_with_default_values: OPERATOR: X - NDIM: 3 DTYPE: float - PACKING: C_packed + STORAGE: texture3d generate_variant_forall: DTYPE: - VALUE: half diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_prepack_weights.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_prepack_weights.glsl index 18202e4a51f..49ce76423d5 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_prepack_weights.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_prepack_weights.glsl @@ -53,7 +53,7 @@ void main() { } // Map tensor_idx to normal buffer_i - const ivec4 p0 = get_texel_nchw_buffer_ixs(idx, sizes, packed_dim); + const ivec4 p0 = tidx_to_nchwi(idx, sizes, packed_dim); // Compute modified tensor_idx by inverting the CPU function const int N = original_sizes.w; diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_prepack_weights.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_prepack_weights.glsl index 493a614ee81..4e8bff94947 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_prepack_weights.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_prepack_weights.glsl @@ -53,7 +53,7 @@ void main() { } // Map tensor_idx to normal buffer_i - const ivec4 p0 = get_texel_nchw_buffer_ixs(idx, sizes, packed_dim); + const ivec4 p0 = tidx_to_nchwi(idx, sizes, packed_dim); // Compute modified tensor_idx by inverting the CPU function const int N = original_sizes.w; diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d_prepack_weights.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d_prepack_weights.glsl index d2978ffe7e6..df8589e737f 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d_prepack_weights.glsl +++ 
b/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d_prepack_weights.glsl @@ -53,7 +53,7 @@ void main() { } // Map tensor_idx to normal buffer_i - const ivec4 p0 = get_texel_nchw_buffer_ixs(idx, sizes, packed_dim); + const ivec4 p0 = tidx_to_nchwi(idx, sizes, packed_dim); // Compute modified tensor_idx by inverting the CPU function const int N = original_sizes.w; diff --git a/backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl b/backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl index 50ddb92c349..d709578b1c9 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl @@ -10,19 +10,16 @@ #define PRECISION ${PRECISION} +#include "indexing_utils.h" + layout(std430) buffer; -${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(1, "r", "t_in", DTYPE, STORAGE)} +${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} +${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} -layout(set = 0, binding = 2) uniform PRECISION restrict CopyArgs { - ivec3 range; - int unused0; - ivec3 src_offset; - int unused1; - ivec3 dst_offset; - int unused2; -}; +${layout_declare_ubo(B, "ivec3", "range", "ivec3", "src_offset", "ivec3", "dst_offset")} +${layout_declare_ubo(B, "ivec4", "out_axis_map")} +${layout_declare_ubo(B, "ivec4", "in_axis_map")} layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; @@ -36,5 +33,9 @@ void main() { return; } - imageStore(t_out, out_pos, texelFetch(t_in, in_pos, 0)); + write_texel_lpos( + t_out, + out_pos, + load_texel_lpos(t_in, in_pos, in_axis_map), + out_axis_map); } diff --git a/backends/vulkan/runtime/graph/ops/glsl/embedding.glsl b/backends/vulkan/runtime/graph/ops/glsl/embedding.glsl index 3adffe99bdb..0a3eeee257f 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/embedding.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/embedding.glsl @@ -16,34 +16,36 @@ layout(std430) buffer; #include "indexing_utils.h" -${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(1, "r", "t_in", "int", STORAGE)} -${layout_declare_tensor(2, "r", "t_weight", DTYPE, STORAGE)} -${layout_declare_ubo(3, "ivec4", "sizes")} +${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} +${layout_declare_tensor(B, "r", "t_in", "int", STORAGE)} +${layout_declare_tensor(B, "r", "t_weight", DTYPE, STORAGE)} +${layout_declare_ubo(B, "ivec4", "sizes")} +${layout_declare_ubo(B, "ivec4", "out_axis_map")} +${layout_declare_ubo(B, "ivec4", "in_axis_map")} +${layout_declare_ubo(B, "ivec4", "weight_axis_map")} layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; layout(constant_id = 3) const int packed_dim = C_DIM; void main() { - const ivec3 out_pos = ivec3(gl_GlobalInvocationID); - - if (pos_out_of_bounds(out_pos, sizes, packed_dim)) { + const ivec3 out_lpos = ivec3(gl_GlobalInvocationID); + const ivec4 out_tidx = lpos_to_tidx(out_lpos, sizes, out_axis_map.w, packed_dim); + if (any(greaterThanEqual(out_tidx, sizes))) { return; } - - const ivec4 out_idx = to_tensor_idx(out_pos, sizes, packed_dim); VEC4_T out_texel; // Consider optimizing via W-packing format for t_in and t_weight. for (int i = 0; i < 4; ++i) { // Read input tensor for embedding index. 
- const ivec3 in_pos = ivec3(out_pos.y, out_idx.z * 4 + i, out_idx.w / 4); - const int in_texel_elem = texelFetch(t_in, in_pos, 0)[out_idx.w % 4]; + const ivec3 in_lpos = ivec3(out_tidx.y, out_tidx.z * 4 + i, out_tidx.w / 4); + const int in_texel_elem = load_texel_lpos(t_in, in_lpos, in_axis_map)[out_tidx.w % 4]; // Read weight tensor for embedding. - out_texel[i] = texelFetch(t_weight, ivec3(out_pos.x, in_texel_elem, 0), 0).x; + const ivec3 weight_lpos = ivec3(out_tidx.x, in_texel_elem, 0); + out_texel[i] = load_texel_lpos(t_weight, weight_lpos, weight_axis_map).x; } - imageStore(t_out, out_pos, out_texel); + write_texel_lpos(t_out, out_lpos, out_texel, out_axis_map); } diff --git a/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl b/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl index b51d5a3f6ed..be3901799f8 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl @@ -21,16 +21,17 @@ ${define_required_extensions(DTYPE)} layout(std430) buffer; -${layout_declare_buffer(0, "w", "nchw_out", DTYPE)} -${layout_declare_tensor(1, "r", "t_in", DTYPE, STORAGE)} -${layout_declare_ubo(2, "ivec4", "sizes")} +${layout_declare_buffer(B, "w", "nchw_out", DTYPE)} +${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} +${layout_declare_ubo(B, "ivec4", "sizes")} +${layout_declare_ubo(B, "ivec4", "axis_map")} layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; layout(constant_id = 3) const int packed_dim = C_DIM; void write_out_texel(VEC4_T texel, ivec4 tensor_idx) { - const ivec4 buf_indices = get_texel_nchw_buffer_ixs( + const ivec4 buf_indices = tidx_to_nchwi( tensor_idx, sizes, packed_dim); @@ -50,13 +51,13 @@ void write_out_texel(VEC4_T texel, ivec4 tensor_idx) { } void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - const ivec4 tensor_idx = to_tensor_idx(pos, sizes, packed_dim); + const ivec3 lpos = ivec3(gl_GlobalInvocationID); + const ivec4 tidx = lpos_to_tidx(lpos, sizes, axis_map.w, packed_dim); - if (any(greaterThanEqual(tensor_idx, sizes))) { + if (any(greaterThanEqual(tidx, sizes))) { return; } - const VEC4_T intex = load_texel(t_in, pos); - write_out_texel(intex, tensor_idx); + const VEC4_T intex = load_texel(t_in, lpos_to_pos(lpos, axis_map)); + write_out_texel(intex, tidx); } diff --git a/backends/vulkan/runtime/graph/ops/glsl/index_select_channel.glsl b/backends/vulkan/runtime/graph/ops/glsl/index_select_channel.glsl index ba60000f3d4..76ec540838c 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/index_select_channel.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/index_select_channel.glsl @@ -34,18 +34,18 @@ void main() { } const ivec4 idx = to_tensor_idx(out_pos, out_sizes, packed_dim); - const ivec4 buffer_ixs = get_texel_nchw_buffer_ixs(idx, out_sizes, packed_dim); + const ivec4 buffer_ixs = tidx_to_nchwi(idx, out_sizes, packed_dim); VEC4_T out_texel; for (int i = 0; i < 4; ++i) { - const ivec4 out_idx = from_nchw_buffer_i(buffer_ixs[i], out_sizes); - int out_channel = out_idx.z; + const ivec4 out_tidx = nchwi_to_tidx(buffer_ixs[i], out_sizes); + int out_channel = out_tidx.z; int in_channel = texelFetch(t_idx, ivec3(out_channel, 0, 0), 0).x; - ivec4 in_idx = out_idx; - in_idx.z = in_channel; + ivec4 in_tidx = out_tidx; + in_tidx.z = in_channel; - ivec4 in_elem_pos = to_texture_elem_pos(in_idx, in_sizes, packed_dim); + ivec4 in_elem_pos = to_texture_elem_pos(in_tidx, in_sizes, packed_dim); VEC4_T in_texel = texelFetch(t_in, in_elem_pos.xyz, 0); diff 
--git a/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h b/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h index 21eadff0b36..cf6686ee08c 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h +++ b/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h @@ -9,32 +9,44 @@ #ifndef INDEXING_UTILS_H #define INDEXING_UTILS_H -// Width Dim Index, assuming (W, H, C, N) order +/* + * The functions defined in this header file use the following shorthand to + * represent tensor related data structures. + * + * tidx - ivec4 tensor indices, listed in WHCN order. + * + * pos - ivec3 texel position, used to fetch from an image texture via the + * texelFetch(image, pos, lod) GLSL function. + * posi - ivec4 texel element position. It is the same as pos, except with an + * additional component of the index of an element within the texel. + * lpos - ivec3 logical position, listed in WHC order. This is a permutation of + * texture position based on a tensor's axis_map. lpos.x is the position + * component that corresponds to the tensor's width dimension, lpos.y is + * the position component that corresponds to the tensor's height dim, + * and so on. + * + * bufi - int index into a GPU buffer that backs a tensor. + * nchwi - int index into a staging buffer for a tensor. The data in the + * staging buffer is stored in contiguous data layout, irrespective of + * the tensor's strides. + */ + +// Width Dim Index, assuming WHCN order #define W_DIM 0 -// Height, assuming (W, H, C, N) order +// Height, assuming WHCN order #define H_DIM 1 -// Channels, assuming (W, H, C, N) order +// Channels, assuming WHCN order #define C_DIM 2 /* - * Describes which texture axis the "batches" dimension runs along in a 4D - * texture. - * - * Currently it is set to 2 since we represent batches by concatenating along - * the channels dim, which has index 2 in (W, H, C, N) order and maps to the - * depth dimension of a texture, which also corresponds to index 2 in (x, y, z) - * order. + * Fast division by 4 using bit shifting */ -#define BATCH_AXIS 2 - -// -// Basic Indexing Utility Macros and Functions -// +#define div4(x) (x >> 2) /* * Divides input and rounds up to 4 */ -#define divup4(x) ((x + 3) / 4) +#define divup4(x) ((x + 3) >> 2) /* * Aligns input to the next multiple of 4 @@ -42,8 +54,8 @@ #define alignup4(x) ((x + 3) & -4) /* - * Input: (W, H, C, N) strides of a tensor - * Returns: the WHCN index of the fastest moving dimension + * Find the packed dimension of a tensor given its strides. The packed dimension + * is the "fastest moving" dimension which will have a stride of 1. */ int find_packed_dim(const ivec4 strides) { int packed_dim = 0; @@ -56,83 +68,179 @@ int find_packed_dim(const ivec4 strides) { return packed_dim; } -// -// (w, h, c, n) Tensor Index <-> Contiguous Buffer Index Conversion -// - /* - * Input: (w, h, c, n) tensor index, (W, H, C, N) sizes of a tensor, which dim - * is packed along a texel - * Output: A ivec4 containing the buffer indices corresponding to each texel - * element. + * Get the staging buffer indices that contain the data of the texel that + * corresponds to the provided tensor index. Since the texel have 4 elements, + * 4 buffer indices will be retrieved. 
*/ -ivec4 get_texel_nchw_buffer_ixs(ivec4 idx, ivec4 sizes, int packed_dim) { +ivec4 tidx_to_nchwi(const ivec4 tidx, const ivec4 sizes, const int packed_dim) { ivec4 strides = ivec4(1, sizes.x, sizes.x * sizes.y, sizes.x * sizes.y * sizes.z); - int base_i = idx.x * strides.x + idx.y * strides.y + idx.z * strides.z + - idx.w * strides.w; + int base_i = tidx.x * strides.x + tidx.y * strides.y + tidx.z * strides.z + + tidx.w * strides.w; return base_i + ivec4(0, 1, 2, 3) * strides[packed_dim]; } -/* - * Input: Index into a tensor's data buffer, (W, H, C, N) sizes of a tensor - * Returns: The WCHN index of the tensor that corresponds to the specified - * buffer index, assuming the buffer has contiguous memory layout - */ -ivec4 from_nchw_buffer_i(int buf_i, ivec4 sizes) { +ivec4 nchwi_to_tidx(const int nchwi, const ivec4 sizes) { return ivec4( - buf_i % sizes.x, - (buf_i / (sizes.x)) % sizes.y, - (buf_i / (sizes.x * sizes.y)) % sizes.z, - (buf_i / (sizes.x * sizes.y * sizes.z))); + nchwi % sizes.x, + (nchwi / (sizes.x)) % sizes.y, + (nchwi / (sizes.x * sizes.y)) % sizes.z, + (nchwi / (sizes.x * sizes.y * sizes.z))); } -int to_nchw_buffer_i(const ivec4 tensor_idx, const ivec4 sizes) { - return tensor_idx.w * sizes.x * sizes.y * sizes.z + - tensor_idx.z * sizes.x * sizes.y + tensor_idx.y * sizes.x + tensor_idx.x; +int tidx_to_nchwi(const ivec4 tidx, const ivec4 sizes) { + return tidx.w * sizes.x * sizes.y * sizes.z + tidx.z * sizes.x * sizes.y + + tidx.y * sizes.x + tidx.x; } -/* - * Input: Texel buffer index, (W, H, C, N) strides of a tensor, which dim is - * packed along a texel - * Returns: The (w, h, c, n) tensor index corresponding to the buffer element - */ -ivec4 to_tensor_idx(int buffer_id, const ivec4 strides, const int packed_dim) { +// TODO(ssjia): make this function use dim order so that it can work with any +// dim order. Currently it assumes that the dim order is contiguous, except for +// the packed dim. +ivec4 bufi_to_tidx(int bufi, const ivec4 strides, const int packed_dim) { ivec4 idx; for (int i = 3; i >= 0; i--) { if (i != packed_dim) { - idx[i] = buffer_id / strides[i]; - buffer_id %= strides[i]; + idx[i] = bufi / strides[i]; + bufi %= strides[i]; } } - idx[packed_dim] = buffer_id; + idx[packed_dim] = bufi; return idx; } -/* - * Input: Texel buffer index, (W, H, C, N) strides of a tensor - * Returns: The (w, h, c, n) tensor index corresponding to the buffer element - * - * This is a convenience overload of the above function. If the packed dim is - * not known, it can be found by finding the first dimension with a stride of 1. - * However, this process adds some overhead, so if performance is a concern then - * the above function should be used instead so that the packed dim is provided. - */ -ivec4 to_tensor_idx(int buffer_id, const ivec4 strides) { +// Convenience overload of the above function, which will determine the packed +// dim from the strides automatically so it doesn't have to be passed in as a +// function argument. 
+ivec4 bufi_to_tidx(const int bufi, const ivec4 strides) { int packed_dim = find_packed_dim(strides); - return to_tensor_idx(buffer_id, strides, packed_dim); + return bufi_to_tidx(bufi, strides, packed_dim); +} + +int tidx_to_bufi(const ivec4 tidx, ivec4 strides) { + return tidx.x * strides.x + tidx.y * strides.y + tidx.z * strides.z + + tidx.w * strides.w; +} + +ivec4 lpos_to_tidx( + ivec3 lpos, + ivec4 sizes, + const int batch_inner_dim, + const int packed_dim) { + // Align packed dim to next multiple of 4 to account for texel padding + sizes[packed_dim] = alignup4(sizes[packed_dim]); + // Moving 1 texel along the packed dim traverses 4 tensor elements + lpos[packed_dim] *= 4; + + ivec4 tidx = ivec4(lpos, 0); + + if (sizes.w > 1) { + tidx.w = tidx[batch_inner_dim] / sizes[batch_inner_dim]; + tidx[batch_inner_dim] %= sizes[batch_inner_dim]; + } + return tidx; +} + +ivec3 tidx_to_lpos( + ivec4 tidx, + ivec4 sizes, + const int batch_inner_dim, + const int packed_dim) { + // Align packed dim to next multiple of 4 to account for texel padding + sizes[packed_dim] = alignup4(sizes[packed_dim]); + + ivec3 lpos = tidx.xyz; + + // Adjust batch inner dim by batch index if needed + if (sizes.w > 1) { + lpos[batch_inner_dim] += tidx.w * sizes[batch_inner_dim]; + } + // Fast division by 4, since moving 1 texel along the packed dim traverses 4 + // tensor elements. + lpos[packed_dim] >>= 2; + return lpos; +} + +ivec3 tidx_to_pos( + ivec4 tidx, + ivec4 sizes, + const ivec4 axis_map, + const int packed_dim) { + // Align packed dim to next multiple of 4 to account for texel padding + sizes[packed_dim] = alignup4(sizes[packed_dim]); + + ivec3 pos; + for (int dim = 0; dim < 3; ++dim) { + pos[axis_map[dim]] = tidx[dim]; + } + + // Adjust batch inner dim by batch index if needed + if (sizes.w > 1) { + pos[axis_map[axis_map.w]] += tidx.w * sizes[axis_map.w]; + } + // Fast division by 4, since moving 1 texel along the packed dim traverses 4 + // tensor elements. + pos[axis_map[packed_dim]] >>= 2; + return pos; +} + +ivec4 tidx_to_posi( + ivec4 tidx, + ivec4 sizes, + const ivec4 axis_map, + const int packed_dim) { + return ivec4( + tidx_to_pos(tidx, sizes, axis_map, packed_dim), tidx[packed_dim] % 4); +} + +ivec3 lpos_to_pos(const ivec3 lpos, const ivec4 axis_map) { + ivec3 pos; + pos[axis_map.x] = lpos.x; + pos[axis_map.y] = lpos.y; + pos[axis_map.z] = lpos.z; + return pos; } +#ifdef USING_BUFFER +#define load_texel(buf, idx) buf[idx] +#elif defined(USING_TEXTURE2D) +#define load_texel(im, pos) texelFetch(im, pos.xy, 0) +#else // defined(USING_TEXTURE3D) +#define load_texel(im, pos) texelFetch(im, pos, 0) +#define load_texel_lpos(im, lpos, axis_map) \ + texelFetch(im, lpos_to_pos(lpos, axis_map), 0) +#endif + +#ifdef USING_BUFFER +#define write_texel(buf, idx, texel) buf[idx] = texel +#elif defined(USING_TEXTURE2D) +#define write_texel(im, pos, texel) imageStore(im, pos.xy, texel) +#else // defined(USING_TEXTURE3D) +#define write_texel(im, pos, texel) imageStore(im, pos, texel) +#define write_texel_lpos(im, lpos, texel, axis_map) \ + imageStore(im, lpos_to_pos(lpos, axis_map), texel) +#endif + +/************************ + * Deprecated Functions * + ************************/ + +// The below functions and macros are in the process of being deprecated in +// favor of newer indexing functions that account for axis mapping and have more +// explicit function names and more updated terminology. 
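Before the deprecated helpers below, a small Python sketch that makes the new terminology concrete: the nchwi <-> tidx round trip and the axis-map permutation. Indices are in WHCN order as documented above; the concrete sizes and axis map are made-up example values:

def tidx_to_nchwi(tidx, sizes):
    # contiguous NCHW staging-buffer index for a WHCN tensor index
    w, h, c, n = tidx
    W, H, C, _ = sizes
    return ((n * C + c) * H + h) * W + w

def nchwi_to_tidx(nchwi, sizes):
    W, H, C, _ = sizes
    return (nchwi % W, (nchwi // W) % H, (nchwi // (W * H)) % C, nchwi // (W * H * C))

def lpos_to_pos(lpos, axis_map):
    # permute a (W, H, C) logical position into an (x, y, z) texture position
    pos = [0, 0, 0]
    for dim in range(3):
        pos[axis_map[dim]] = lpos[dim]
    return tuple(pos)

sizes = (8, 4, 3, 2)                              # W, H, C, N
tidx = (5, 2, 1, 1)
assert nchwi_to_tidx(tidx_to_nchwi(tidx, sizes), sizes) == tidx
assert lpos_to_pos((7, 3, 0), (2, 0, 1, 0)) == (3, 0, 7)   # W -> z, H -> x, C -> y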
+ /* - * Input: (w, h, c, n) tensor index, (W, H, C, N) strides of the tensor buffer - * Returns: the buffer index corresponding to the specified tensor index + * Describes which texture axis the "batches" dimension runs along in a 4D + * texture. + * + * Currently it is set to 2 since we represent batches by concatenating along + * the channels dim, which has index 2 in (W, H, C, N) order and maps to the + * depth dimension of a texture, which also corresponds to index 2 in (x, y, z) + * order. */ -int to_buffer_id(const ivec4 tensor_idx, ivec4 strides) { - return tensor_idx.x * strides.x + tensor_idx.y * strides.y + - tensor_idx.z * strides.z + tensor_idx.w * strides.w; -} +#define BATCH_AXIS 2 // // (w, h, c, n) Tensor Index <-> (x, y, z) Texture Position Conversion @@ -218,26 +326,6 @@ ivec4 to_texture_elem_pos(ivec4 idx, ivec4 sizes, int packed_dim) { return pos; } -// -// Texel Access and Storage -// - -#ifdef USING_BUFFER -#define load_texel(buf, idx) buf[idx] -#elif defined(USING_TEXTURE2D) -#define load_texel(im, pos) texelFetch(im, pos.xy, 0) -#else // defined(USING_TEXTURE3D) -#define load_texel(im, pos) texelFetch(im, pos, 0) -#endif - -#ifdef USING_BUFFER -#define write_texel(buf, idx, texel) buf[idx] = texel -#elif defined(USING_TEXTURE2D) -#define write_texel(im, pos, texel) imageStore(im, pos.xy, texel) -#else // defined(USING_TEXTURE3D) -#define write_texel(im, pos, texel) imageStore(im, pos, texel) -#endif - // // Miscellaneous Utility Functions and Macros // diff --git a/backends/vulkan/runtime/graph/ops/glsl/int8_image_to_nchw_noint8.glsl b/backends/vulkan/runtime/graph/ops/glsl/int8_image_to_nchw_noint8.glsl index b1e3a0abdfe..f7133dd0452 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/int8_image_to_nchw_noint8.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/int8_image_to_nchw_noint8.glsl @@ -16,10 +16,11 @@ layout(std430) buffer; #extension GL_EXT_control_flow_attributes : require -${layout_declare_buffer(0, "w", "nchw_out", "int")} -${layout_declare_tensor(1, "r", "t_in", "int8", "texture3d")} -${layout_declare_ubo(2, "ivec4", "tensor_sizes")} -${layout_declare_ubo(3, "int", "out_numel")} +${layout_declare_buffer(B, "w", "nchw_out", "int")} +${layout_declare_tensor(B, "r", "t_in", "int8", "texture3d")} +${layout_declare_ubo(B, "ivec4", "tensor_sizes")} +${layout_declare_ubo(B, "ivec4", "axis_map")} +${layout_declare_ubo(B, "int", "out_numel")} layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; @@ -40,9 +41,9 @@ void main() { int in_buf_idx = 4 * out_buf_idx; [[unroll]] for (int i = 0; i < 4; ++i) { - const ivec4 tensor_idx = from_nchw_buffer_i(in_buf_idx, tensor_sizes); + const ivec4 tidx = nchwi_to_tidx(in_buf_idx, tensor_sizes); const ivec4 texture_pos = to_texture_elem_pos( - tensor_idx, tensor_sizes, packed_dim); + tidx, tensor_sizes, packed_dim); values[i] = load_texel(t_in, texture_pos.xyz)[texture_pos.w]; in_buf_idx++; } diff --git a/backends/vulkan/runtime/graph/ops/glsl/matmul.h b/backends/vulkan/runtime/graph/ops/glsl/matmul.h deleted file mode 100644 index 620f1fd0e6e..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/matmul.h +++ /dev/null @@ -1,283 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -// To convince the SPIR-V compiler to unroll the loops optimally, need this -// macro -#define FOUR 4 - -#ifdef TILE_ROW_2 -#define TILE_ROWS 2 -#else -#define TILE_ROWS 4 -#endif - -// we avoid mat4 and vec4 usage here as they compile to much less efficient -// SPIR-V -struct FloatMatrix_2d { - float data[TILE_ROWS][FOUR]; -}; - -struct FloatMatrix_3d { - float data[TILE_ROWS][FOUR][FOUR]; -}; - -#ifdef MAT2_IS_TRANSPOSED -vec4 matmul_naive_W_packed_W_packed( -#else -vec4 matmul_naive_W_packed_H_packed( -#endif - const sampler3D im_mat1, - const sampler3D im_mat2, - const ivec3 out_pos, - const int width) { - ivec3 mat1_pos = ivec3(0, out_pos.y, out_pos.z); -#ifdef MAT2_IS_TRANSPOSED - ivec3 mat2_pos = ivec3(0, out_pos.x * 4, 0); -#else - ivec3 mat2_pos = ivec3(out_pos.x * 4, 0, out_pos.z); -#endif - - vec4 texel = vec4(0); - const int K = (width + 3) / 4; - - for (int i = 0; i < K; ++i) { - const vec4 mat1_tex = texelFetch(im_mat1, mat1_pos, 0); -#ifdef MAT2_IS_TRANSPOSED - const vec4 sums = vec4( - dot(mat1_tex, texelFetch(im_mat2, mat2_pos, 0)), - dot(mat1_tex, texelFetch(im_mat2, mat2_pos + ivec3(0, 1, 0), 0)), - dot(mat1_tex, texelFetch(im_mat2, mat2_pos + ivec3(0, 2, 0), 0)), - dot(mat1_tex, texelFetch(im_mat2, mat2_pos + ivec3(0, 3, 0), 0))); -#else - const vec4 sums = vec4( - dot(mat1_tex, texelFetch(im_mat2, mat2_pos, 0)), - dot(mat1_tex, texelFetch(im_mat2, mat2_pos + ivec3(1, 0, 0), 0)), - dot(mat1_tex, texelFetch(im_mat2, mat2_pos + ivec3(2, 0, 0), 0)), - dot(mat1_tex, texelFetch(im_mat2, mat2_pos + ivec3(3, 0, 0), 0))); -#endif - - texel += sums; - - mat1_pos.x++; -#ifdef MAT2_IS_TRANSPOSED - mat2_pos.x++; -#else - mat2_pos.y++; -#endif - } - - return texel; -} - -#ifdef MAT2_IS_TRANSPOSED -vec4 matmul_naive_W_packed_H_packed( -#else -vec4 matmul_naive_W_packed_W_packed( -#endif - const sampler3D im_mat1, - const sampler3D im_mat2, - const ivec3 out_pos, - const int width) { - ivec3 mat1_pos = ivec3(0, out_pos.y, out_pos.z); - ivec3 mat2_pos = ivec3(out_pos.x, 0, out_pos.z); - - vec4 texel = vec4(0); - int K = divup4(width); - - for (int i = 0; i < K; ++i) { - vec4 mat1_tex = texelFetch(im_mat1, mat1_pos, 0); - texel = fma(mat1_tex.xxxx, texelFetch(im_mat2, mat2_pos, 0), texel); - mat2_pos.y++; - texel = fma(mat1_tex.yyyy, texelFetch(im_mat2, mat2_pos, 0), texel); - mat2_pos.y++; - texel = fma(mat1_tex.zzzz, texelFetch(im_mat2, mat2_pos, 0), texel); - mat2_pos.y++; - texel = fma(mat1_tex.wwww, texelFetch(im_mat2, mat2_pos, 0), texel); - mat2_pos.y++; - - mat1_pos.x++; - } - - return texel; -} - -// get texel from self tensor (width_packed) in addmm -vec4 get_texel_W_packed( - sampler3D im_self, - const ivec3 pos, - const bool broadcast_at_width, - const bool broadcast_at_height) { - vec4 self_texel; - // self is of shape {1} - if (broadcast_at_width && broadcast_at_height) { - self_texel = texelFetch(im_self, ivec3(0, 0, 0), 0).xxxx; - } - // self is of shape {*, 1} - else if (broadcast_at_width) { - self_texel = texelFetch(im_self, ivec3(0, pos.y, 0), 0).xxxx; - } - // self is of shape {1, *} - else if (broadcast_at_height) { - self_texel = texelFetch(im_self, ivec3(pos.x, 0, 0), 0); - } else { - self_texel = texelFetch(im_self, ivec3(pos.x, pos.y, 0), 0); - } - - return self_texel; -} - -// get texel from self tensor (channel_packed) in addmm -vec4 get_texel_C_packed( - sampler3D im_self, - const ivec3 pos, - const bool broadcast_at_width, - const bool broadcast_at_height) { - vec4 self_texel; - // self is of shape {1} - if (broadcast_at_width && 
broadcast_at_height) { - self_texel = texelFetch(im_self, ivec3(0, 0, 0), 0); - } - // self is of shape {*, 1} - else if (broadcast_at_width) { - self_texel = texelFetch(im_self, ivec3(0, pos.y, 0), 0); - } - // self is of shape {1, *} - else if (broadcast_at_height) { - self_texel = texelFetch(im_self, ivec3(pos.x, 0, 0), 0); - } else { - self_texel = texelFetch(im_self, ivec3(pos.x, pos.y, 0), 0); - } - - return self_texel; -} - -FloatMatrix_2d matmul_partial_2d( - sampler3D im_mat1, - sampler3D im_mat2, - const ivec3 pos, - const int batch_size, - const int K_texel_len) { - FloatMatrix_2d results; - for (int i = 0; i < TILE_ROWS; i++) { - for (int j = 0; j < FOUR; j++) { - results.data[i][j] = 0.0f; - } - } - vec4 im_mat1_partial_load[TILE_ROWS]; - vec4 im_mat2_partial_load[FOUR]; - - for (int mat1_x = 0; mat1_x < K_texel_len; mat1_x++) { - for (int offset = 0; offset < TILE_ROWS; offset++) { - // read and cache 2x4 (or 4x4) tile of im_mat1 - const int mat1_y = (TILE_ROWS * pos.y) + offset; - const ivec3 mat1_pos = ivec3(mat1_x, mat1_y, 0); - im_mat1_partial_load[offset] = texelFetch(im_mat1, mat1_pos, 0); - // read and cache 4x4 tile of im_mat2 -#ifdef MAT2_IS_TRANSPOSED - const int mat2_y = (FOUR * pos.x) + offset; - const ivec3 mat2_pos = ivec3(mat1_x, mat2_y, 0); - im_mat2_partial_load[offset] = texelFetch(im_mat2, mat2_pos, 0); -#else - const int mat2_x = (FOUR * pos.x) + offset; - const ivec3 mat2_pos = ivec3(mat2_x, mat1_x, 0); - im_mat2_partial_load[offset] = texelFetch(im_mat2, mat2_pos, 0); -#endif - } - -#ifdef TILE_ROW_2 -// column 3 and 4 of im_mat2 -#ifdef MAT2_IS_TRANSPOSED - im_mat2_partial_load[2] = - texelFetch(im_mat2, ivec3(mat1_x, (FOUR * pos.x) + 2, 0), 0); - im_mat2_partial_load[3] = - texelFetch(im_mat2, ivec3(mat1_x, (FOUR * pos.x) + 3, 0), 0); -#else - im_mat2_partial_load[2] = - texelFetch(im_mat2, ivec3((FOUR * pos.x) + 2, mat1_x, 0), 0); - im_mat2_partial_load[3] = - texelFetch(im_mat2, ivec3((FOUR * pos.x) + 3, mat1_x, 0), 0); -#endif -#endif - - // perform partial dot products and add partial result to results - for (int out_row = 0; out_row < TILE_ROWS; out_row++) { - for (int out_col = 0; out_col < FOUR; out_col++) { - results.data[out_row][out_col] += - dot(im_mat1_partial_load[out_row], im_mat2_partial_load[out_col]); - } - } - } - return results; -} - -FloatMatrix_3d matmul_partial_3d( - sampler3D im_mat1, - sampler3D im_mat2, - const ivec3 pos, - const int batch_size, - const int K_texel_len) { - FloatMatrix_3d results; - for (int i = 0; i < TILE_ROWS; i++) { - for (int j = 0; j < FOUR; j++) { - for (int k = 0; k < FOUR; k++) { - results.data[i][j][k] = 0.0f; - } - } - } - vec4 im_mat1_partial_load[TILE_ROWS]; - vec4 im_mat2_partial_load[FOUR]; - - for (int batch_idx = 0; batch_idx < FOUR; batch_idx++) { - if (FOUR * pos.z + batch_idx >= batch_size) { - break; - } - int mat_z = FOUR * pos.z + batch_idx; - for (int mat1_x = 0; mat1_x < K_texel_len; mat1_x++) { - for (int offset = 0; offset < TILE_ROWS; offset++) { - // read and cache 2x4 (or 4x4) tile of im_mat1 - const int mat1_y = (TILE_ROWS * pos.y) + offset; - const ivec3 mat1_pos = ivec3(mat1_x, mat1_y, mat_z); - im_mat1_partial_load[offset] = texelFetch(im_mat1, mat1_pos, 0); - // read and cache 4x4 tile of im_mat2 -#ifdef MAT2_IS_TRANSPOSED - const int mat2_y = (FOUR * pos.x) + offset; - const ivec3 mat2_pos = ivec3(mat1_x, mat2_y, 0); - im_mat2_partial_load[offset] = texelFetch(im_mat2, mat2_pos, 0); -#else - const int mat2_x = (FOUR * pos.x) + offset; - const ivec3 mat2_pos = ivec3(mat2_x, 
mat1_x, mat_z); - im_mat2_partial_load[offset] = texelFetch(im_mat2, mat2_pos, 0); -#endif - } - -#ifdef TILE_ROW_2 -// column 3, and 4 of im_mat2 -#ifdef MAT2_IS_TRANSPOSED - im_mat2_partial_load[2] = - texelFetch(im_mat2, ivec3(mat1_x, (FOUR * pos.x) + 2, 0), 0); - im_mat2_partial_load[3] = - texelFetch(im_mat2, ivec3(mat1_x, (FOUR * pos.x) + 3, 0), 0); -#else - im_mat2_partial_load[2] = - texelFetch(im_mat2, ivec3((FOUR * pos.x) + 2, mat1_x, mat_z), 0); - im_mat2_partial_load[3] = - texelFetch(im_mat2, ivec3((FOUR * pos.x) + 3, mat1_x, mat_z), 0); -#endif -#endif - - // perform partial dot products and add partial result to results - for (int out_row = 0; out_row < TILE_ROWS; out_row++) { - for (int out_col = 0; out_col < FOUR; out_col++) { - results.data[out_row][out_col][batch_idx] += - dot(im_mat1_partial_load[out_row], im_mat2_partial_load[out_col]); - } - } - } - } - return results; -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/matmul_naive_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/matmul_naive_buffer.glsl index 81f0a815cb9..e4064eed2fa 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/matmul_naive_buffer.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/matmul_naive_buffer.glsl @@ -32,35 +32,29 @@ ${layout_declare_ubo(9, "int", "out_numel")} layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; void main() { - const ivec4 out_idx = ivec4( + const ivec4 out_bufix = ivec4( gl_GlobalInvocationID.x, gl_GlobalInvocationID.y, gl_GlobalInvocationID.z % out_sizes.z, gl_GlobalInvocationID.z / out_sizes.z); - if (any(greaterThanEqual(out_idx, out_sizes))) { + if (any(greaterThanEqual(out_bufix, out_sizes))) { return; } - int mat1_id = to_buffer_id( - ivec4(0, out_idx.y, out_idx.z, out_idx.w), mat1_strides); - int mat2_id = to_buffer_id( - ivec4(out_idx.x, 0, out_idx.z, out_idx.w), mat2_strides); - - int orig_mat1_id = to_buffer_id( - ivec4(0, out_idx.y, out_idx.z, out_idx.w), mat1_strides); - - int orig_mat2_id = to_buffer_id( - ivec4(out_idx.x, 0, 0, 0), mat2_strides); + int mat1_bufi = tidx_to_bufi( + ivec4(0, out_bufix.y, out_bufix.z, out_bufix.w), mat1_strides); + int mat2_bufi = tidx_to_bufi( + ivec4(out_bufix.x, 0, out_bufix.z, out_bufix.w), mat2_strides); T sum = T(0.0); for (int i = 0; i < mat1_sizes.x; ++i) { - sum += t_mat1[mat1_id] * t_mat2[mat2_id]; + sum += t_mat1[mat1_bufi] * t_mat2[mat2_bufi]; - mat1_id += mat1_strides.x; - mat2_id += mat2_strides.y; + mat1_bufi += mat1_strides.x; + mat2_bufi += mat2_strides.y; } - const int out_id = to_buffer_id(out_idx, out_strides); - t_out[out_id] = T(sum); + const int out_bufi = tidx_to_bufi(out_bufix, out_strides); + t_out[out_bufi] = T(sum); } diff --git a/backends/vulkan/runtime/graph/ops/glsl/matmul_naive_texture3d.glsl b/backends/vulkan/runtime/graph/ops/glsl/matmul_naive_texture3d.glsl deleted file mode 100644 index 7225f2c64a0..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/matmul_naive_texture3d.glsl +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -$if MAT2_IS_TRANSPOSED: - #define MAT2_IS_TRANSPOSED - -#include "indexing_utils.h" -#include "matmul.h" - -${layout_declare_tensor(0, "w", "im_out", DTYPE, "texture3d")} -${layout_declare_tensor(1, "r", "im_mat1", DTYPE, "texture3d")} -${layout_declare_tensor(2, "r", "im_mat2", DTYPE, "texture3d")} -${layout_declare_ubo(3, "ivec3", "out_limits")} -${layout_declare_ubo(4, "ivec4", "in_sizes")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, out_limits))) { - return; - } - - vec4 texel = vec4(0); - - $if MAT1_PACKING == "W_packed": - $if MAT2_PACKING == "H_packed": - texel = matmul_naive_W_packed_H_packed( - im_mat1, - im_mat2, - pos, - in_sizes[0]); - $elif MAT2_PACKING == "W_packed": - texel = matmul_naive_W_packed_W_packed( - im_mat1, - im_mat2, - pos, - in_sizes[0]); - $else: - $raise Exception("Unsupported value for MAT2_PACKING") - $else: - $raise Exception("Unsupported value combo for MAT1_PACKING and MAT2_PACKING") - - imageStore(im_out, pos, texel); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/matmul_naive_texture3d.yaml b/backends/vulkan/runtime/graph/ops/glsl/matmul_naive_texture3d.yaml deleted file mode 100644 index bb1eed494a5..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/matmul_naive_texture3d.yaml +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -matmul_naive_texture3d: - parameter_names_with_default_values: - DTYPE: float - STORAGE: texture3d - MAT1_PACKING: W_packed - MAT2_PACKING: H_packed - MAT2_IS_TRANSPOSED: false - generate_variant_forall: - DTYPE: - - VALUE: float - - VALUE: half - shader_variants: - - NAME: matmul_naive_texture3d_W_packed_H_packed - - NAME: matmul_naive_texture3d_W_packed_W_packed - MAT2_PACKING: W_packed - - NAME: matmul_transposed_naive_texture3d_W_packed_W_packed - MAT2_PACKING: W_packed - MAT2_IS_TRANSPOSED: true diff --git a/backends/vulkan/runtime/graph/ops/glsl/matmul_optimized.glsl b/backends/vulkan/runtime/graph/ops/glsl/matmul_optimized.glsl deleted file mode 100644 index 8634371a7b4..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/matmul_optimized.glsl +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -$if MAT2_IS_TRANSPOSED: - #define MAT2_IS_TRANSPOSED - -$if BATCH_MODE: - #define BATCH_MODE - -$if TILE_ROW == "tile_row_2": - #define TILE_ROW_2 - -#include "indexing_utils.h" -#include "matmul.h" - -layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly image3D im_out; -layout(set = 0, binding = 1) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat1; -layout(set = 0, binding = 2) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat2; - -layout(set = 0, binding = 3) uniform PRECISION restrict OutLimits { - ivec3 out_limits; -}; - -layout(set = 0, binding = 4) uniform PRECISION restrict OutSizes { - ivec4 out_sizes; -}; - -layout(set = 0, binding = 5) uniform PRECISION restrict InLimits { - ivec3 in_limits; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, out_limits))) { - return; - } - - $if BATCH_MODE: - FloatMatrix_3d results = matmul_partial_3d( - im_mat1, - im_mat2, - pos, - out_sizes[2], - in_limits[0]); - $else: - FloatMatrix_2d results = matmul_partial_2d( - im_mat1, - im_mat2, - pos, - out_sizes[2], - in_limits[0]); - - for (int idx_c = 0; idx_c < TILE_ROWS; idx_c++) { - for (int idx_r = 0; idx_r < FOUR; idx_r++) { - const ivec3 out_pos = - ivec3(idx_r + FOUR * pos.x, idx_c + TILE_ROWS * pos.y, pos.z); - - // results is in transposed order w.r.t. the desired output - $if BATCH_MODE: - imageStore( - im_out, - out_pos, - vec4( - results.data[idx_c][idx_r][0], - results.data[idx_c][idx_r][1], - results.data[idx_c][idx_r][2], - results.data[idx_c][idx_r][3])); - $else: - imageStore( - im_out, - out_pos, - vec4(results.data[idx_c][idx_r], 0.0, 0.0, 0.0)); - } - } -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/matmul_optimized.yaml b/backends/vulkan/runtime/graph/ops/glsl/matmul_optimized.yaml deleted file mode 100644 index 9268d5a25aa..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/matmul_optimized.yaml +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -matmul_optimized: - parameter_names_with_default_values: - DTYPE: float - NDIM: 3 - PACKING: C_packed - MAT2_IS_TRANSPOSED: false - BATCH_MODE: false - TILE_ROW: tile_row_4 - generate_variant_forall: - TILE_ROW: - - VALUE: tile_row_4 - - VALUE: tile_row_2 - DTYPE: - - VALUE: float - - VALUE: half - shader_variants: - - NAME: matmul_optimized - - NAME: matmul_transposed_optimized - MAT2_IS_TRANSPOSED: true - - NAME: batch_matmul_optimized - BATCH_MODE: true - - NAME: batch_matmul_transposed_optimized - MAT2_IS_TRANSPOSED: true - BATCH_MODE: true diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl index d861972f935..ea4e0d300cc 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl @@ -23,13 +23,13 @@ layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; layout(constant_id = 3) const int UNUSED_packed_dim = W_DIM; void main() { - int out_id = int(gl_GlobalInvocationID.x); - if (out_id >= numel) { + int out_bufi = int(gl_GlobalInvocationID.x); + if (out_bufi >= numel) { return; } - ivec4 out_idx = to_tensor_idx(out_id, out_strides); - const int in_id = to_nchw_buffer_i(out_idx, out_sizes); + ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides); + const int in_nchwi = tidx_to_nchwi(out_tidx, out_sizes); - t_out[out_id] = nchw_in[in_id]; + t_out[out_bufi] = nchw_in[in_nchwi]; } diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl index abe93904805..b86a59fc234 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl @@ -21,42 +21,43 @@ ${define_required_extensions(DTYPE)} layout(std430) buffer; -${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_buffer(1, "r", "nchw_in", DTYPE)} -${layout_declare_ubo(2, "ivec4", "sizes")} +${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} +${layout_declare_buffer(B, "r", "nchw_in", DTYPE)} +${layout_declare_ubo(B, "ivec4", "sizes")} +${layout_declare_ubo(B, "ivec4", "axis_map")} layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; layout(constant_id = 3) const int packed_dim = C_DIM; -VEC4_T read_texel(ivec4 tensor_idx) { - const ivec4 buf_indices = get_texel_nchw_buffer_ixs( - tensor_idx, +VEC4_T read_texel(ivec4 tidx) { + const ivec4 buf_indices = tidx_to_nchwi( + tidx, sizes, packed_dim); VEC4_T texel = VEC4_T(0); - if (tensor_idx[packed_dim] < sizes[packed_dim]) { + if (tidx[packed_dim] < sizes[packed_dim]) { texel.x = SCALAR_T(nchw_in[buf_indices.x]); } - if (tensor_idx[packed_dim] + 1 < sizes[packed_dim]) { + if (tidx[packed_dim] + 1 < sizes[packed_dim]) { texel.y = SCALAR_T(nchw_in[buf_indices.y]); } - if (tensor_idx[packed_dim] + 2 < sizes[packed_dim]) { + if (tidx[packed_dim] + 2 < sizes[packed_dim]) { texel.z = SCALAR_T(nchw_in[buf_indices.z]); } - if (tensor_idx[packed_dim] + 3 < sizes[packed_dim]) { + if (tidx[packed_dim] + 3 < sizes[packed_dim]) { texel.w = SCALAR_T(nchw_in[buf_indices.w]); } return texel; } void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - const ivec4 tensor_idx = to_tensor_idx(pos, sizes, packed_dim); - if (any(greaterThanEqual(tensor_idx, sizes))) { + const ivec3 lpos = ivec3(gl_GlobalInvocationID); + const ivec4 tidx = lpos_to_tidx(lpos, sizes, axis_map.w, packed_dim); + if (any(greaterThanEqual(tidx, sizes))) { return; } - 
write_texel(t_out, pos, read_texel(tensor_idx)); + write_texel(t_out, lpos_to_pos(lpos, axis_map), read_texel(tidx)); } diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8_image_noint8.glsl b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8_image_noint8.glsl index 378cf09d129..f3a3370f3ba 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8_image_noint8.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8_image_noint8.glsl @@ -16,9 +16,10 @@ layout(std430) buffer; #extension GL_EXT_control_flow_attributes : require -${layout_declare_tensor(0, "w", "t_out", "int8", "texture3d")} -${layout_declare_buffer(1, "r", "nchw_in", "int")} -${layout_declare_ubo(2, "ivec4", "tensor_sizes")} +${layout_declare_tensor(B, "w", "t_out", "int8", "texture3d")} +${layout_declare_buffer(B, "r", "nchw_in", "int")} +${layout_declare_ubo(B, "ivec4", "sizes")} +${layout_declare_ubo(B, "ivec4", "axis_map")} layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; @@ -34,9 +35,9 @@ int extend_sign(int x) { return x; } -ivec4 read_texel(ivec4 tensor_idx) { - const ivec4 buf_indices = get_texel_nchw_buffer_ixs( - tensor_idx, tensor_sizes, packed_dim); +ivec4 read_texel(ivec4 tidx) { + const ivec4 buf_indices = tidx_to_nchwi( + tidx, sizes, packed_dim); int shift = (1 << 8) - 1; ivec4 masks; @@ -51,7 +52,7 @@ ivec4 read_texel(ivec4 tensor_idx) { ivec4 out_tex = ivec4(0); [[unroll]] for (int i = 0; i < 4; ++i) { - if (tensor_idx[packed_dim] + i < tensor_sizes[packed_dim]) { + if (tidx[packed_dim] + i < sizes[packed_dim]) { int in_texel = nchw_in[buf_indices[i] / 4]; int extracted_val = (in_texel & masks[i]) >> (8 * (buf_indices[i] % 4)); extracted_val = extend_sign(extracted_val); @@ -63,12 +64,12 @@ ivec4 read_texel(ivec4 tensor_idx) { } void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - const ivec4 tensor_idx = to_tensor_idx(pos, tensor_sizes, packed_dim); + const ivec3 lpos = ivec3(gl_GlobalInvocationID); + const ivec4 tidx = lpos_to_tidx(lpos, sizes, axis_map.w, packed_dim); - if (any(greaterThanEqual(tensor_idx, tensor_sizes))) { + if (any(greaterThanEqual(tidx, sizes))) { return; } - write_texel(t_out, pos, read_texel(tensor_idx)); + write_texel(t_out, lpos_to_pos(lpos, axis_map), read_texel(tidx)); } diff --git a/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear.glsl b/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear.glsl new file mode 100644 index 00000000000..de42f9ed996 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear.glsl @@ -0,0 +1,135 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#version 450 core + +#include "indexing_utils.h" + +#define PRECISION ${PRECISION} + +#define FOUR 4 + +#define VEC4_T ${texel_load_type(DTYPE, STORAGE)} +#define FLOAT_T ${buffer_scalar_type(DTYPE)} + +${define_active_storage_type(STORAGE)} + +${define_required_extensions(DTYPE)} +${define_required_extensions("int8")} + +layout(std430) buffer; + +${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)} +${layout_declare_tensor(1, "r", "t_mat1", DTYPE, STORAGE)} +${layout_declare_tensor(2, "r", "t_mat2", "int8", "buffer")} +${layout_declare_tensor(3, "r", "t_scales_and_zeros", DTYPE, STORAGE)} + +$if STORAGE == "texture3d": + ${layout_declare_ubo(4, "ivec4", "out_sizes")} + ${layout_declare_ubo(5, "ivec4", "mat1_sizes")} + ${layout_declare_ubo(6, "ivec4", "mat2_strides")} + ${layout_declare_ubo(7, "ivec4", "scales_strides")} +$else: + ${layout_declare_ubo(4, "ivec4", "out_sizes")} + ${layout_declare_ubo(5, "ivec4", "out_strides")} + ${layout_declare_ubo(6, "ivec4", "mat1_sizes")} + ${layout_declare_ubo(7, "ivec4", "mat1_strides")} + ${layout_declare_ubo(8, "ivec4", "mat2_strides")} + ${layout_declare_ubo(9, "ivec4", "scales_strides")} + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +layout(constant_id = 3) const int group_size = 1; + +void main() { + + const ivec4 out_pos = ivec4( + gl_GlobalInvocationID.x, // n = 0..N-1 + gl_GlobalInvocationID.y, // m = 0..M-1 + gl_GlobalInvocationID.z % out_sizes.z, + gl_GlobalInvocationID.z / out_sizes.z); + + if (any(greaterThanEqual(out_pos, out_sizes))) { + return; + } + + const uint K = mat1_sizes.x; + const uint n = out_pos.x; + const uint m = out_pos.y; + const uint mask = uint(0x0f); + + float rc = 0.0; + int k = 0; + const uint k_block = (K + group_size - 1) / group_size; + + #ifdef USING_BUFFER + ivec4 mat1_pos = ivec4(0, m, out_pos.z, out_pos.w); + ivec4 mat2_pos = ivec4(0, n, out_pos.z, out_pos.w); + ivec4 scale_pos = ivec4(0, n, 0, out_pos.w); + ivec4 zero_pos = ivec4(0, n, 1, out_pos.w); + + for (int kb = 0; kb < k_block; kb++) { + scale_pos.x = kb; + const int scale_bufi = tidx_to_bufi(scale_pos, scales_strides); + const float scale = float(t_scales_and_zeros[scale_bufi]); + + zero_pos.x = kb; + const int zero_bufi = tidx_to_bufi(zero_pos, scales_strides); + const float zero = float(t_scales_and_zeros[zero_bufi]) - scale * 8.0; + + for(uint idx = 0; idx < group_size && k < K; idx++, k++) { + mat1_pos.x = k; + const int mat1_bufi = tidx_to_bufi(mat1_pos, mat1_strides); + const float mat1_val = float(t_mat1[mat1_bufi]); + + mat2_pos.x = k / 2; + const int mat2_bufi = tidx_to_bufi(mat2_pos, mat2_strides); + // Bitwise op treats sign bit from int8 as a value bit instead, + // since there is no uint8_t datatype + uint mat2_val = (t_mat2[mat2_bufi] & 0xFF); + mat2_val = (k & 1) == 0 ? 
mat2_val & mask : (mat2_val >> 4); + + rc += mat1_val * (scale * float(mat2_val) + zero); + } + } + + const int out_bufi = tidx_to_bufi(out_pos, out_strides); + t_out[out_bufi] = FLOAT_T(rc); + + #else // Using texture + ivec3 mat1_pos = ivec3(0, m, out_pos.z); + ivec4 mat2_pos = ivec4(0, n, out_pos.z, out_pos.w); + ivec3 scale_zero_pos = ivec3(0, n, 0); + uint K_texel = K / FOUR; + + for (int kb = 0; kb < k_block; kb++) { + scale_zero_pos.x = kb; + const vec4 scale_zero = load_texel(t_scales_and_zeros, scale_zero_pos); + const float scale = scale_zero.x; + const float zero = scale_zero.y - scale * 8.0; + + for(uint idx = 0; idx < group_size && k < K_texel; idx += FOUR, k++) { + mat1_pos.x = k; + const VEC4_T mat1_tex = load_texel(t_mat1, mat1_pos); + + mat2_pos.x = k * 2; // k * FOUR / 2 + const int mat2_id = tidx_to_bufi(mat2_pos, mat2_strides); + + for (int texel_pos = 0; texel_pos < FOUR; texel_pos++) { + // Bitwise op treats sign bit from int8 as a value bit instead, + // since there is no uint8_t datatype + uint mat2_val = (t_mat2[mat2_id + texel_pos / 2] & 0xFF); + mat2_val = (texel_pos & 1) == 0 ? mat2_val & mask : (mat2_val >> 4); + rc += mat1_tex[texel_pos] * (scale * float(mat2_val) + zero); + } + } + } + write_texel(t_out, out_pos.xyz, vec4(rc, 0, 0, 0)); + + #endif +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear.yaml b/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear.yaml new file mode 100644 index 00000000000..fd65068080a --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear.yaml @@ -0,0 +1,19 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +q_4w_linear: + parameter_names_with_default_values: + DTYPE: float + STORAGE: buffer + generate_variant_forall: + DTYPE: + - VALUE: float + - VALUE: half + STORAGE: + - VALUE: buffer + - VALUE: texture3d + shader_variants: + - NAME: q_4w_linear diff --git a/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear.glsl b/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear.glsl index 7557a7b0c3d..a72df89b634 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear.glsl @@ -49,14 +49,14 @@ layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; #ifdef USING_BUFFER void main() { - const int t_id = int(gl_GlobalInvocationID.x); - if (t_id >= out_numel) { + const int out_bufi = int(gl_GlobalInvocationID.x); + if (out_bufi >= out_numel) { return; } - const ivec4 out_idx = to_tensor_idx(t_id, out_strides, 0); + const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, 0); - t_out[t_id] = q_8w_linear(out_idx, mat1_sizes.x); + t_out[out_bufi] = q_8w_linear(out_tidx, mat1_sizes.x); } #else // USING_TEXTURE diff --git a/backends/vulkan/runtime/graph/ops/glsl/slice_channel.glsl b/backends/vulkan/runtime/graph/ops/glsl/slice_channel.glsl index d1562d65762..45e6c3358e8 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/slice_channel.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/slice_channel.glsl @@ -43,11 +43,11 @@ void main() { // we calculate the source whcn-coordinate amended with offset-ed channel // value. Then we calculate the actual texture position from the // whcn-coordinate. 
- const ivec4 buf_indices = get_texel_nchw_buffer_ixs(idx, out_sizes, packed_dim); + const ivec4 buf_indices = tidx_to_nchwi(idx, out_sizes, packed_dim); vec4 outex; for (int i=0;i<4;i++) { - ivec4 user_coor = from_nchw_buffer_i(buf_indices[i], out_sizes); + ivec4 user_coor = nchwi_to_tidx(buf_indices[i], out_sizes); int in_channel = user_coor.z; diff --git a/backends/vulkan/runtime/graph/ops/glsl/unary_op.yaml b/backends/vulkan/runtime/graph/ops/glsl/unary_op.yaml index eb05b10b108..2b9f0032f41 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/unary_op.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/unary_op.yaml @@ -38,3 +38,5 @@ unary_op: OPERATOR: hardshrink(X, A, B) - NAME: hardswish OPERATOR: hardswish(X) + - NAME: hardsigmoid + OPERATOR: hardsigmoid(X) diff --git a/backends/vulkan/runtime/graph/ops/glsl/view.glsl b/backends/vulkan/runtime/graph/ops/glsl/view.glsl index 0b0f587d1d5..8d45e65b396 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/view.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/view.glsl @@ -39,13 +39,13 @@ void main() { // Assume there is a virtual continuous buffer in nchw format. From the output // pos, we first calculate the index in the virtual buffer, and then calculate // the input position from the index. - const ivec4 buf_indices = get_texel_nchw_buffer_ixs(out_tensor_idx, out_sizes, out_packed_dim); + const ivec4 buf_indices = tidx_to_nchwi(out_tensor_idx, out_sizes, out_packed_dim); VEC4_T value = VEC4_T(0); // Need to look up the 4 values in the output texel separately. for (int i = 0 ; i < 4; i++) { if (out_tensor_idx[out_packed_dim]++ < out_sizes[out_packed_dim]) { - ivec4 user_coor = from_nchw_buffer_i(buf_indices[i], in_sizes); + ivec4 user_coor = nchwi_to_tidx(buf_indices[i], in_sizes); ivec4 in_pos_elem = to_texture_elem_pos(user_coor, in_sizes, in_packed_dim); VEC4_T intex = texelFetch(t_in, in_pos_elem.xyz, 0); value[i] = intex[in_pos_elem.w]; diff --git a/backends/vulkan/runtime/graph/ops/impl/BatchNorm.cpp b/backends/vulkan/runtime/graph/ops/impl/BatchNorm.cpp index 8e346bd2088..eb0f1f99a2f 100644 --- a/backends/vulkan/runtime/graph/ops/impl/BatchNorm.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/BatchNorm.cpp @@ -88,7 +88,7 @@ void add_native_batch_norm_node( {{out_ref, vkapi::MemoryAccessType::WRITE}, {{in_ref, arg_weight, arg_bias, arg_mean, arg_var}, vkapi::MemoryAccessType::READ}}, - {t_out->texture_limits_ubo(), + {t_out->logical_limits_ubo(), graph.create_params_buffer(epsilon), graph.create_params_buffer(num_texel_per_batch)})); } diff --git a/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp b/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp index 6bab8d19111..3ae67489af9 100644 --- a/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp @@ -21,7 +21,7 @@ void check_binary_op_args( const api::vTensor& self, const api::vTensor& other, const api::vTensor& out) { - VK_CHECK_COND(check_same_memory_layout(self, other, out)); + VK_CHECK_COND(check_same_packed_dim(self, other, out)); std::vector broadcasted_sizes = calculate_broadcasted_output_size(self, other); VK_CHECK_COND(out.sizes() == broadcasted_sizes); @@ -53,7 +53,7 @@ void add_binary_op_node( const std::string& op_name) { ValueRef arg1 = prepack_if_tensor_ref(graph, in1); ValueRef arg2 = - prepack_if_tensor_ref(graph, in2, graph.memory_layout_of(arg1)); + prepack_if_tensor_ref(graph, in2, graph.estimate_memory_layout_of(arg1)); vTensorPtr t_in1 = graph.get_tensor(arg1); vTensorPtr t_in2 = graph.get_tensor(arg2); @@
-85,12 +85,15 @@ void add_binary_op_node( {{arg1, arg2}, vkapi::MemoryAccessType::READ}}, // Shader params buffers {t_out->sizes_ubo(), + t_out->axis_map_ubo(), t_in1->sizes_ubo(), + t_in1->axis_map_ubo(), t_in2->sizes_ubo(), + t_in2->axis_map_ubo(), graph.create_params_buffer(broadcast_params), graph.create_params_buffer(alpha_val)}, // Specialization Constants - {SV(t_out->packed_dim_whcn_idx())}, + {SV(t_out->packed_dim())}, // Resizing Logic resize_binary_op_node, {})); diff --git a/backends/vulkan/runtime/graph/ops/impl/Cat.cpp b/backends/vulkan/runtime/graph/ops/impl/Cat.cpp index cd947091bc1..d5cfd5f4505 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Cat.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Cat.cpp @@ -25,7 +25,7 @@ void add_cat_default_node( for (ValueRef input_ref : *input_list) { vTensorPtr t_in = graph.get_tensor(input_ref); - VK_CHECK_COND(check_memory_layout_is(*t_in, utils::kChannelsPacked)); + VK_CHECK_COND(check_packed_dim_is(*t_in, WHCN::kChannelsDim)); } int64_t dim = graph.extract_scalar(dim_ref); @@ -40,7 +40,7 @@ void add_cat_default_node( for (ValueRef input_ref : *input_list) { vTensorPtr t_in = graph.get_tensor(input_ref); - utils::ivec3 range = t_in->texture_limits(); + utils::ivec3 range = t_in->logical_limits(); add_copy_offset_node( graph, input_ref, range, src_offset, dst_offset, out); dst_offset[0] += range[0]; @@ -52,7 +52,7 @@ void add_cat_default_node( for (ValueRef input_ref : *input_list) { vTensorPtr t_in = graph.get_tensor(input_ref); - utils::ivec3 range = t_in->texture_limits(); + utils::ivec3 range = t_in->logical_limits(); add_copy_offset_node( graph, input_ref, range, src_offset, dst_offset, out); dst_offset[1] += range[1]; @@ -63,7 +63,7 @@ void add_cat_default_node( for (ValueRef input_ref : *input_list) { vTensorPtr t_in = graph.get_tensor(input_ref); - utils::ivec3 range = t_in->texture_limits(); + utils::ivec3 range = t_in->logical_limits(); add_copy_offset_node( graph, input_ref, range, src_offset, dst_offset, out); dst_offset[2] += range[2]; diff --git a/backends/vulkan/runtime/graph/ops/impl/Clone.cpp b/backends/vulkan/runtime/graph/ops/impl/Clone.cpp index cef751bc7c8..946a0c9f407 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Clone.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Clone.cpp @@ -32,7 +32,7 @@ void add_clone_node( graph.create_local_wg_size(out), {{out, vkapi::MemoryAccessType::WRITE}, {in, vkapi::MemoryAccessType::READ}}, - {t_out->texture_limits_ubo()})); + {t_out->logical_limits_ubo()})); } void clone(ComputeGraph& graph, const std::vector& args) { diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp index 52af0542b6a..360193fb17f 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp @@ -106,9 +106,9 @@ ValueRef prepack_biases( graph.create_local_wg_size(v), vref, v, - {t->sizes_ubo()}, + {t->sizes_ubo(), t->axis_map_ubo()}, // Specialization constants - {SV(t->packed_dim_whcn_idx())})); + {SV(t->packed_dim())})); return v; } @@ -216,14 +216,14 @@ ValueRef prepack_weights( graph.create_params_buffer( utils::make_ivec4(original_sizes, /*reverse = */ true))}, // Specialization constants - {SV(t->packed_dim_whcn_idx())})); + {SV(t->packed_dim())})); return v; } void check_conv_args(const api::vTensor& in, const api::vTensor& out) { - VK_CHECK_COND(check_memory_layout_is(in, utils::kChannelsPacked)); - VK_CHECK_COND(check_memory_layout_is(out, 
utils::kChannelsPacked)); + VK_CHECK_COND(check_packed_dim_is(in, WHCN::kChannelsDim)); + VK_CHECK_COND(check_packed_dim_is(out, WHCN::kChannelsDim)); } struct Conv2dParams final { @@ -291,7 +291,7 @@ utils::uvec3 create_conv2d_global_wg_size( const Conv2dMethod method, const ValueRef out) { if (method == Conv2dMethod::Pointwise) { - const utils::uvec3 image_extents = graph.image_extents_of(out); + const utils::uvec3 image_extents = graph.logical_limits_of(out); return { utils::div_up(image_extents[0u], 2u), utils::div_up(image_extents[1u], 2u), @@ -376,7 +376,7 @@ void add_conv2d_node( {{arg_in, arg_weight, arg_bias}, vkapi::MemoryAccessType::READ}}, // Shader params buffers { - t_out->texture_limits_ubo(), + t_out->logical_limits_ubo(), t_in->sizes_ubo(), graph.create_params_buffer(kernel_params), graph.create_params_buffer(extra_params), @@ -444,7 +444,7 @@ void add_conv1d_node( int32_t out_group_size = static_cast(out_channels / groups_val); utils::uvec3 global_size = {1, static_cast(out_channels), 1}; - utils::uvec3 local_size = {1, 1, 1}; + utils::uvec3 local_size = {1, 64, 1}; Kernel1dParams kernel_params = { kernel_size, @@ -474,8 +474,12 @@ void add_conv1d_node( {{arg_in, arg_weight, arg_bias}, vkapi::MemoryAccessType::READ}}, // Shader params buffers { - t_out->texture_limits_ubo(), + t_out->logical_limits_ubo(), t_in->sizes_ubo(), + t_out->axis_map_ubo(), + t_in->axis_map_ubo(), + t_weight->axis_map_ubo(), + t_bias->axis_map_ubo(), graph.create_params_buffer(kernel_params), graph.create_params_buffer(out_params), }, @@ -562,6 +566,7 @@ void conv(ComputeGraph& graph, const std::vector& args) { REGISTER_OPERATORS { VK_REGISTER_OP(aten.convolution.default, conv); VK_REGISTER_OP(conv_with_clamp.default, conv); + VK_REGISTER_OP(et_vk.conv_with_clamp.default, conv); } } // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Copy.cpp b/backends/vulkan/runtime/graph/ops/impl/Copy.cpp index b15844e1409..1fe65611d9f 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Copy.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Copy.cpp @@ -33,19 +33,13 @@ void add_copy_offset_node( add_dtype_suffix(kernel_name, *t_out); const struct Block final { - ivec3 range; - int32_t unused0; - ivec3 src_offset; - int32_t unused1; - ivec3 dst_offset; - int32_t unused2; + alignas(16) ivec3 range; + alignas(16) ivec3 src_offset; + alignas(16) ivec3 dst_offset; } offset_params{ range, - 0, src_offset, - 0, dst_offset, - 0, }; auto shader = VK_KERNEL_FROM_STR(kernel_name); @@ -61,7 +55,11 @@ void add_copy_offset_node( {in, vkapi::MemoryAccessType::READ}, }, // Parameter buffers - {graph.create_params_buffer(offset_params)}, + { + graph.create_params_buffer(offset_params), + t_out->axis_map_ubo(), + t_in->axis_map_ubo(), + }, // Specialization Constants {})); } @@ -80,8 +78,8 @@ void add_copy_channel_offset_node( std::vector in_sizes = t_in->sizes(); std::vector out_sizes = t_out->sizes(); - VK_CHECK_COND(check_memory_layout_is(*t_in, utils::kChannelsPacked)); - VK_CHECK_COND(check_memory_layout_is(*t_out, utils::kChannelsPacked)); + VK_CHECK_COND(check_packed_dim_is(*t_in, WHCN::kChannelsDim)); + VK_CHECK_COND(check_packed_dim_is(*t_out, WHCN::kChannelsDim)); // NOTE: This function should be able to support 1d and 2d tensors when // range=1, src_offset=dst_offset=1. 
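A minimal standalone sketch of the parameter-block change made in Copy.cpp above, assuming a plain three-int stand-in for utils::ivec3 and a std430 block on the shader side; the struct names, the stand-in type, and the static_asserts below are illustrative and not part of the patch:

#include <cstddef>
#include <cstdint>

// Stand-in for utils::ivec3: three 4-byte ints, 12 bytes, 4-byte aligned.
struct ivec3 {
  int32_t x, y, z;
};

// Old style: pad each ivec3 out to 16 bytes by hand with unused fields.
struct BlockManual {
  ivec3 range;
  int32_t unused0;
  ivec3 src_offset;
  int32_t unused1;
  ivec3 dst_offset;
  int32_t unused2;
};

// New style: let alignas(16) introduce the same padding implicitly.
struct BlockAligned {
  alignas(16) ivec3 range;
  alignas(16) ivec3 src_offset;
  alignas(16) ivec3 dst_offset;
};

// Both layouts start each ivec3 on a 16-byte boundary, matching the 16-byte
// base alignment an ivec3 member has in a std430 block, so the shader reads
// identical data either way.
static_assert(sizeof(BlockManual) == 48, "manual padding yields 48 bytes");
static_assert(sizeof(BlockAligned) == 48, "alignas padding yields 48 bytes");
static_assert(offsetof(BlockManual, dst_offset) == 32, "same member offsets");
static_assert(offsetof(BlockAligned, dst_offset) == 32, "same member offsets");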
diff --git a/backends/vulkan/runtime/graph/ops/impl/Embedding.cpp b/backends/vulkan/runtime/graph/ops/impl/Embedding.cpp index be0b457b79c..2d733b4964c 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Embedding.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Embedding.cpp @@ -21,9 +21,9 @@ void check_embedding_args( const api::vTensor& weight, const api::vTensor& in, const api::vTensor& out) { - VK_CHECK_COND(check_memory_layout_is(weight, utils::kChannelsPacked)); - VK_CHECK_COND(check_memory_layout_is(in, utils::kChannelsPacked)); - VK_CHECK_COND(check_memory_layout_is(out, utils::kChannelsPacked)); + VK_CHECK_COND(check_packed_dim_is(weight, WHCN::kChannelsDim)); + VK_CHECK_COND(check_packed_dim_is(in, WHCN::kChannelsDim)); + VK_CHECK_COND(check_packed_dim_is(out, WHCN::kChannelsDim)); } void add_embedding_node( @@ -48,7 +48,12 @@ void add_embedding_node( graph.create_local_wg_size(out), {{out, vkapi::MemoryAccessType::WRITE}, {{in, weight}, vkapi::MemoryAccessType::READ}}, - {t_out->sizes_ubo()})); + { + t_out->sizes_ubo(), + t_out->axis_map_ubo(), + t_in->axis_map_ubo(), + t_weight->axis_map_ubo(), + })); } void embedding(ComputeGraph& graph, const std::vector& args) { diff --git a/backends/vulkan/runtime/graph/ops/impl/Full.cpp b/backends/vulkan/runtime/graph/ops/impl/Full.cpp index 157515e6e0a..34acb43c668 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Full.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Full.cpp @@ -54,7 +54,7 @@ void add_full_node( // Shader params buffers {t_out->sizes_ubo(), graph.create_params_buffer(fill_value_val)}, // Specialization Constants - {SV(t_out->packed_dim_whcn_idx())}, + {SV(t_out->packed_dim())}, // Resizing Logic resize_full_node, {size_or_in})); diff --git a/backends/vulkan/runtime/graph/ops/impl/IndexSelect.cpp b/backends/vulkan/runtime/graph/ops/impl/IndexSelect.cpp index 7b4e45262c0..d9a0cdedd79 100644 --- a/backends/vulkan/runtime/graph/ops/impl/IndexSelect.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/IndexSelect.cpp @@ -21,9 +21,9 @@ void check_index_select_args( const api::vTensor& in, const api::vTensor& idx, const api::vTensor& out) { - VK_CHECK_COND(check_memory_layout_is(in, utils::kChannelsPacked)); - VK_CHECK_COND(check_memory_layout_is(idx, utils::kChannelsPacked)); - VK_CHECK_COND(check_memory_layout_is(out, utils::kChannelsPacked)); + VK_CHECK_COND(check_packed_dim_is(in, WHCN::kChannelsDim)); + VK_CHECK_COND(check_packed_dim_is(idx, WHCN::kChannelsDim)); + VK_CHECK_COND(check_packed_dim_is(out, WHCN::kChannelsDim)); } void add_index_select_channel_node( diff --git a/backends/vulkan/runtime/graph/ops/impl/Linear.cpp b/backends/vulkan/runtime/graph/ops/impl/Linear.cpp index 63b60bf52f7..b96b8840026 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Linear.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Linear.cpp @@ -36,7 +36,7 @@ void check_addmm_args( VK_CHECK_COND(mat1_sizes.size() == 2 || mat1_sizes.size() == 3); VK_CHECK_COND(mat1_sizes.size() == mat2_sizes.size()); - VK_CHECK_COND(graph.memory_layout_of(mat1) == graph.memory_layout_of(out)); + VK_CHECK_COND(graph.packed_dim_of(mat1) == graph.packed_dim_of(out)); VK_CHECK_COND(utils::val_at(-1, mat1_sizes) == utils::val_at(-2, mat2_sizes)); @@ -100,27 +100,36 @@ void add_addmm_naive_node( std::string kernel_name = graph.get_bool(mat2_is_transposed) ? 
"linear_naive" : "addmm_naive"; kernel_name.reserve(kShaderNameReserve); - add_memory_layout_suffix(kernel_name, graph.memory_layout_of(mat1)); - add_memory_layout_suffix(kernel_name, graph.memory_layout_of(mat2)); + add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); add_dtype_suffix(kernel_name, graph.dtype_of(out)); + utils::uvec3 global_wg_size = graph.logical_limits_of(out); graph.execute_nodes().emplace_back(new ExecuteNode( graph, VK_KERNEL_FROM_STR(kernel_name), - graph.create_global_wg_size(out), - graph.create_local_wg_size(out), + global_wg_size, + graph.create_local_wg_size(global_wg_size), // Inputs and Outputs {{out, vkapi::MemoryAccessType::WRITE}, {{mat1, mat2, self}, vkapi::MemoryAccessType::READ}}, // Shader params buffers { - graph.texture_limits_ubo(out), + graph.sizes_ubo(out), + graph.logical_limits_ubo(out), + graph.axis_map_ubo(out), graph.sizes_ubo(mat1), + graph.axis_map_ubo(mat1), + graph.sizes_ubo(mat2), + graph.axis_map_ubo(mat2), graph.sizes_ubo(self), + graph.axis_map_ubo(self), graph.create_params_buffer(params), }, // Specialization Constants - {}, + {graph.packed_dim_of(out), + graph.packed_dim_of(mat1), + graph.packed_dim_of(mat2), + graph.packed_dim_of(self)}, // Resizing Logic resize_addmm_node, {mat2_is_transposed})); @@ -151,7 +160,7 @@ void add_addmm_optimized_node( ValueRef mat2_packed = mat2; const utils::GPUMemoryLayout mat2_layout = mat2_is_transposed_val ? utils::kWidthPacked : utils::kHeightPacked; - if (graph.memory_layout_of(mat2) != mat2_layout) { + if (graph.estimate_memory_layout_of(mat2) != mat2_layout) { mat2_packed = graph.add_tensor_like(mat2, mat2_layout); viewFn(graph, {mat2, graph.add_none(), mat2_packed}); } @@ -173,11 +182,20 @@ void add_addmm_optimized_node( add_dtype_suffix(kernel_name, graph.dtype_of(out)); - utils::uvec3 global_size; + utils::uvec3 global_size = graph.logical_limits_of(out); + + // Each thread computes a W=(2/4) x H=4 x C=(1/4) output tile. Therefore, the + // total number of threads is W/(2 or 4) x H/4 x C/1. Since the out tensor is + // channels packed, C does not need to be divided by 4. The "identity" of each + // thread is the (x, y, z) coordinate of the output tile it is computing, and + // this identity can be used to compute the tensor index of the top left + // element in the tile, which will be [W=x*(2 or 4), H=y*4, C=z*(1 or 4), N=0] if (mat1_sizes.at(mat1_dims - 2) < 8) { - global_size = utils::divup_vec(graph.image_extents_of(out), {4, 2, 1}); + // Use `logical_extents` instead of `image_extents` because the workgroup + // axes need to correspond to tensor dimensions. 
+ global_size = utils::divup_vec(global_size, {4, 2, 1}); } else { - global_size = utils::divup_vec(graph.image_extents_of(out), {4, 4, 1}); + global_size = utils::divup_vec(global_size, {4, 4, 1}); } utils::uvec3 local_size = adaptive_work_group_size(global_size); @@ -191,14 +209,18 @@ void add_addmm_optimized_node( {{mat1_W_packed, mat2_packed, self}, vkapi::MemoryAccessType::READ}}, // Shader params buffers { - graph.texture_limits_ubo(out), graph.sizes_ubo(out), + graph.axis_map_ubo(out), + graph.sizes_ubo(mat1_W_packed), + graph.axis_map_ubo(mat1_W_packed), + graph.sizes_ubo(mat2_packed), + graph.axis_map_ubo(mat2_packed), graph.sizes_ubo(self), - graph.texture_limits_ubo(mat1_W_packed), + graph.axis_map_ubo(self), graph.create_params_buffer(params), }, // Specialization Constants - {}, + {graph.packed_dim_of(out)}, // Resizing Logic resize_addmm_node, {mat2_is_transposed})); @@ -224,10 +246,10 @@ void add_addmm_node( } Params params = {alpha_val, beta_val}; - if (graph.memory_layout_of(mat1) == utils::kChannelsPacked) { + if (graph.packed_dim_of(mat1) == WHCN::kChannelsDim) { add_addmm_optimized_node( graph, self, mat1, mat2, beta, alpha, out, params, mat2_is_transposed); - } else if (graph.memory_layout_of(mat1) == utils::kWidthPacked) { + } else if (graph.packed_dim_of(mat1) == WHCN::kWidthDim) { add_addmm_naive_node( graph, self, mat1, mat2, beta, alpha, out, params, mat2_is_transposed); } else { diff --git a/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp b/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp index 2d9346e1340..1034dc445e8 100644 --- a/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp @@ -29,7 +29,7 @@ void check_matmul_args( VK_CHECK_COND(mat1_sizes.size() == 2 || mat1_sizes.size() == 3); VK_CHECK_COND(mat1_sizes.size() == mat2_sizes.size()); - VK_CHECK_COND(graph.memory_layout_of(mat1) == graph.memory_layout_of(out)); + VK_CHECK_COND(graph.packed_dim_of(mat1) == graph.packed_dim_of(out)); VK_CHECK_COND(utils::val_at(-1, mat1_sizes) == utils::val_at(-2, mat2_sizes)); } @@ -116,25 +116,31 @@ void add_matmul_naive_texture3d_node( : "matmul_naive"; kernel_name.reserve(kShaderNameReserve); add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); - add_memory_layout_suffix(kernel_name, graph.memory_layout_of(mat1)); - add_memory_layout_suffix(kernel_name, graph.memory_layout_of(mat2)); add_dtype_suffix(kernel_name, graph.dtype_of(out)); + utils::uvec3 global_wg_size = graph.logical_limits_of(out); graph.execute_nodes().emplace_back(new ExecuteNode( graph, VK_KERNEL_FROM_STR(kernel_name), - graph.create_global_wg_size(out), - graph.create_local_wg_size(out), + global_wg_size, + graph.create_local_wg_size(global_wg_size), // Inputs and Outputs {{out, vkapi::MemoryAccessType::WRITE}, {{mat1, mat2}, vkapi::MemoryAccessType::READ}}, // Shader params buffers { - graph.texture_limits_ubo(out), + graph.sizes_ubo(out), + graph.logical_limits_ubo(out), + graph.axis_map_ubo(out), graph.sizes_ubo(mat1), + graph.axis_map_ubo(mat1), + graph.sizes_ubo(mat2), + graph.axis_map_ubo(mat2), }, // Specialization Constants - {}, + {graph.packed_dim_of(out), + graph.packed_dim_of(mat1), + graph.packed_dim_of(mat2)}, // Resizing Logic resize_matmul_node, {mat2_is_transposed})); @@ -159,7 +165,7 @@ void add_matmul_optimized_node( ValueRef mat2_packed = mat2; const utils::GPUMemoryLayout mat2_layout = mat2_is_transposed_val ? 
utils::kWidthPacked : utils::kHeightPacked; - if (graph.memory_layout_of(mat2) != mat2_layout) { + if (graph.estimate_memory_layout_of(mat2) != mat2_layout) { mat2_packed = graph.add_tensor_like(mat2, mat2_layout); viewFn(graph, {mat2, graph.add_none(), mat2_packed}); } @@ -181,12 +187,21 @@ void add_matmul_optimized_node( add_dtype_suffix(kernel_name, graph.dtype_of(out)); - utils::uvec3 global_size; + // Each thread computes a W=(2/4) x H=4 x C=(1/4) output tile. Therefore, the + // total number of threads is W/(2 or 4) x H/4 x C/1. Since the out tensor is + // channels packed, C does not need to be divided by 4. The "identity" of each + // thread is the (x, y, z) coordinate of the output tile it is computing, and + // this identity can be used to compute the tensor index of the top left + // element in the tile, which will be [W=x*(2 or 4), H=y*4, C=z*(1 or 4), N=0] + utils::uvec3 global_size = graph.logical_limits_of(out); if (mat1_sizes.at(mat1_dims - 2) < 8) { - global_size = utils::divup_vec(graph.image_extents_of(out), {4, 2, 1}); + // Use `logical_extents` instead of `image_extents` because the workgroup + // axes need to correspond to tensor dimensions. + global_size = utils::divup_vec(global_size, {4, 2, 1}); } else { - global_size = utils::divup_vec(graph.image_extents_of(out), {4, 4, 1}); + global_size = utils::divup_vec(global_size, {4, 4, 1}); } + utils::uvec3 local_size = adaptive_work_group_size(global_size); graph.execute_nodes().emplace_back(new ExecuteNode( @@ -199,12 +214,15 @@ void add_matmul_optimized_node( {{mat1_W_packed, mat2_packed}, vkapi::MemoryAccessType::READ}}, // Shader params buffers { - graph.texture_limits_ubo(out), graph.sizes_ubo(out), - graph.texture_limits_ubo(mat1_W_packed), + graph.axis_map_ubo(out), + graph.sizes_ubo(mat1_W_packed), + graph.axis_map_ubo(mat1_W_packed), + graph.sizes_ubo(mat2_packed), + graph.axis_map_ubo(mat2_packed), }, // Specialization Constants - {}, + {graph.packed_dim_of(out)}, // Resizing Logic resize_matmul_node, {mat2_is_transposed})); @@ -219,13 +237,13 @@ void add_matmul_node( if (graph.is_buffer_storage(out)) { add_matmul_naive_buffer_node( graph, mat1, mat2_data, out, mat2_is_transposed); - } else if (graph.memory_layout_of(mat1) == utils::kChannelsPacked) { + } else if (graph.packed_dim_of(mat1) == WHCN::kChannelsDim) { add_matmul_optimized_node(graph, mat1, mat2_data, out, mat2_is_transposed); - } else if (graph.memory_layout_of(mat1) == utils::kWidthPacked) { + } else if (graph.packed_dim_of(mat1) == WHCN::kWidthDim) { add_matmul_naive_texture3d_node( graph, mat1, mat2_data, out, mat2_is_transposed); } else { - VK_THROW("Input should be channel packed or width packed."); + VK_THROW("Input texture should be channel packed or width packed."); } } diff --git a/backends/vulkan/runtime/graph/ops/impl/NativeLayerNorm.cpp b/backends/vulkan/runtime/graph/ops/impl/NativeLayerNorm.cpp index 2b15d924706..553075fc4bb 100644 --- a/backends/vulkan/runtime/graph/ops/impl/NativeLayerNorm.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/NativeLayerNorm.cpp @@ -49,8 +49,8 @@ void resize_native_layer_norm_node( } void check_args(const api::vTensor& in, const api::vTensor& out) { - VK_CHECK_COND(check_memory_layout_is(in, utils::kChannelsPacked)); - VK_CHECK_COND(check_memory_layout_is(out, utils::kChannelsPacked)); + VK_CHECK_COND(check_packed_dim_is(in, WHCN::kChannelsDim)); + VK_CHECK_COND(check_packed_dim_is(out, WHCN::kChannelsDim)); } void add_native_layer_norm_node( @@ -76,10 +76,10 @@ void add_native_layer_norm_node( } 
ValueRef arg_in = prepack_if_tensor_ref(graph, in); - ValueRef arg_weight = - prepack_if_tensor_ref(graph, weight, graph.memory_layout_of(arg_in)); - ValueRef arg_bias = - prepack_if_tensor_ref(graph, bias, graph.memory_layout_of(arg_in)); + ValueRef arg_weight = prepack_if_tensor_ref( + graph, weight, graph.estimate_memory_layout_of(arg_in)); + ValueRef arg_bias = prepack_if_tensor_ref( + graph, bias, graph.estimate_memory_layout_of(arg_in)); const auto out_val = graph.get_value_list(out); vTensorPtr t_out = graph.get_tensor(out_val->at(0)); @@ -91,7 +91,7 @@ void add_native_layer_norm_node( std::vector in_sizes = t_input->sizes(); - utils::uvec3 global_size = t_mean->image_extents(); + utils::uvec3 global_size = t_mean->logical_limits(); utils::uvec3 local_size = adaptive_work_group_size(global_size); std::string kernel_name("native_layer_norm"); @@ -109,7 +109,7 @@ void add_native_layer_norm_node( vkapi::MemoryAccessType::WRITE}, {{arg_in, arg_weight, arg_bias}, vkapi::MemoryAccessType::READ}}, // Shader params buffers - {t_out->texture_limits_ubo(), + {t_out->logical_limits_ubo(), t_out->sizes_ubo(), graph.create_params_buffer(epsilon)}, // Specialization Constants diff --git a/backends/vulkan/runtime/graph/ops/impl/Permute.cpp b/backends/vulkan/runtime/graph/ops/impl/Permute.cpp index e78fca15a0a..e45a333123d 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Permute.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Permute.cpp @@ -28,8 +28,8 @@ void check_args( const api::vTensor& in, const std::vector& permute_dims, const api::vTensor& out) { - VK_CHECK_COND(check_memory_layout_is(in, utils::kChannelsPacked)); - VK_CHECK_COND(check_memory_layout_is(out, utils::kChannelsPacked)); + VK_CHECK_COND(check_packed_dim_is(in, WHCN::kChannelsDim)); + VK_CHECK_COND(check_packed_dim_is(out, WHCN::kChannelsDim)); // This implementation doesn't require the input tensor to have the same // dim size as the argument.
The code will work as long as the input tensor's @@ -90,7 +90,7 @@ void add_permute_node( graph.create_local_wg_size(out), {{out, vkapi::MemoryAccessType::WRITE}, {in, vkapi::MemoryAccessType::READ}}, - {t_out->texture_limits_ubo(), + {t_out->logical_limits_ubo(), t_out->sizes_ubo(), graph.create_params_buffer(params)}, // Specialization Constants diff --git a/backends/vulkan/runtime/graph/ops/impl/Pool.cpp b/backends/vulkan/runtime/graph/ops/impl/Pool.cpp index 8b477d3a31a..ba8d971a1af 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Pool.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Pool.cpp @@ -18,8 +18,8 @@ namespace vkcompute { void check_pool2d_args(const api::vTensor& in, const api::vTensor& out) { - VK_CHECK_COND(check_memory_layout_is(in, utils::kChannelsPacked)); - VK_CHECK_COND(check_memory_layout_is(out, utils::kChannelsPacked)); + VK_CHECK_COND(check_packed_dim_is(in, WHCN::kChannelsDim)); + VK_CHECK_COND(check_packed_dim_is(out, WHCN::kChannelsDim)); } void resize_pool2d_node( @@ -79,7 +79,7 @@ void add_max_pool2d_node( check_pool2d_args(*t_in, *t_out); - utils::uvec3 global_size = t_out->image_extents(); + utils::uvec3 global_size = t_out->logical_limits(); utils::uvec3 local_size = adaptive_work_group_size(global_size); std::string kernel_name("max_pool2d"); @@ -103,7 +103,7 @@ void add_max_pool2d_node( {arg, vkapi::MemoryAccessType::READ}}, // Shader params buffers { - t_out->texture_limits_ubo(), + t_out->logical_limits_ubo(), t_in->sizes_ubo(), graph.create_params_buffer(kernel_params), }, @@ -155,7 +155,7 @@ void add_avg_pool2d_node( check_pool2d_args(*t_in, *t_out); - utils::uvec3 global_size = t_out->image_extents(); + utils::uvec3 global_size = t_out->logical_limits(); utils::uvec3 local_size = adaptive_work_group_size(global_size); std::string kernel_name("avg_pool2d"); @@ -176,7 +176,7 @@ void add_avg_pool2d_node( {{out, vkapi::MemoryAccessType::WRITE}, {arg, vkapi::MemoryAccessType::READ}}, // Shader params buffers - {t_out->texture_limits_ubo(), + {t_out->logical_limits_ubo(), t_in->sizes_ubo(), graph.create_params_buffer(kernel_params), graph.create_params_buffer(divisor_params)}, diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp index 732643ef754..28bf6513957 100644 --- a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp @@ -30,7 +30,7 @@ void check_qlinear_args( VK_CHECK_COND(qmat2_sizes.size() == 2); VK_CHECK_COND(scales_sizes.size() == 1); - VK_CHECK_COND(graph.memory_layout_of(mat1) == graph.memory_layout_of(out)); + VK_CHECK_COND(graph.packed_dim_of(mat1) == graph.packed_dim_of(out)); VK_CHECK_COND( utils::val_at(-1, mat1_sizes) == utils::val_at(-1, qmat2_sizes)); @@ -78,8 +78,8 @@ void add_q_8w_linear_node( std::string kernel_name = "q_8w_linear"; kernel_name.reserve(kShaderNameReserve); - add_memory_layout_suffix(kernel_name, graph.memory_layout_of(mat1)); - add_memory_layout_suffix(kernel_name, graph.memory_layout_of(q_mat2)); + add_packed_dim_suffix(kernel_name, graph.packed_dim_of(mat1)); + add_packed_dim_suffix(kernel_name, graph.packed_dim_of(q_mat2)); add_dtype_suffix(kernel_name, graph.dtype_of(out)); add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); @@ -94,7 +94,7 @@ void add_q_8w_linear_node( graph.strides_ubo(q_mat2), graph.strides_ubo(scales)}); } else { - ubos.append({graph.texture_limits_ubo(out), graph.sizes_ubo(mat1)}); + 
ubos.append({graph.logical_limits_ubo(out), graph.sizes_ubo(mat1)}); } graph.execute_nodes().emplace_back(new ExecuteNode( diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedMatMul.cpp b/backends/vulkan/runtime/graph/ops/impl/QuantizedMatMul.cpp new file mode 100644 index 00000000000..17bd62ad6ea --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/QuantizedMatMul.cpp @@ -0,0 +1,184 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include + +namespace vkcompute { + +void check_q_matmul_args( + ComputeGraph& graph, + const ValueRef mat1, + const ValueRef mat2_data, + const ValueRef group_size_data, + const ValueRef scales_and_zeros, + const ValueRef out) { + const std::vector mat1_sizes = graph.sizes_of(mat1); + const std::vector mat2_sizes = graph.sizes_of(mat2_data); + const std::vector scales_and_zeros_sizes = + graph.sizes_of(scales_and_zeros); + + const uint32_t group_size = graph.extract_scalar(group_size_data); + + VK_CHECK_COND(mat1_sizes.size() == 2); + VK_CHECK_COND(mat1_sizes.size() == mat2_sizes.size()); + + using namespace WHCN; + VK_CHECK_COND(graph.packed_dim_of(mat1) == kWidthDim); + VK_CHECK_COND(graph.packed_dim_of(mat2_data) == kWidthDim); + // VK_CHECK_COND(graph.packed_dim_of(scales_and_zeros) == kWidthDim); + + if (graph.storage_type_of(scales_and_zeros) == utils::kBuffer) { + VK_CHECK_COND(graph.packed_dim_of(scales_and_zeros) == kWidthDim); + } else { + VK_CHECK_COND(graph.packed_dim_of(scales_and_zeros) == kChannelsDim); + } + + if (graph.storage_type_of(out) == utils::kBuffer) { + VK_CHECK_COND(graph.packed_dim_of(out) == kWidthDim); + } else { + VK_CHECK_COND(graph.packed_dim_of(out) == kChannelsDim); + } + + const int mat1_K = utils::val_at(-1, mat1_sizes); + const int mat2_K = utils::val_at(-1, mat2_sizes) * 2; + const int N = utils::val_at(-2, mat2_sizes); + + VK_CHECK_COND(mat1_K == mat2_K); + + VK_CHECK_COND(mat2_K % group_size == 0); + + const uint32_t k_groups = mat2_K / group_size; + + VK_CHECK_COND(scales_and_zeros_sizes.size() == 3); + VK_CHECK_COND(utils::val_at(-1, scales_and_zeros_sizes) == k_groups); + VK_CHECK_COND(utils::val_at(-2, scales_and_zeros_sizes) == N); + VK_CHECK_COND(utils::val_at(-3, scales_and_zeros_sizes) == 2); + + // Match https://fburl.com/code/6ostkknm + std::vector valid_group_sizes = {32, 64, 128, 256}; + + bool is_valid_group_size = false; + for (auto valid_group_size : valid_group_sizes) { + if (group_size == valid_group_size) { + is_valid_group_size = true; + break; + } + } + + VK_CHECK_COND(is_valid_group_size); +} + +void resize_q_matmul_node( + ComputeGraph* graph, + const std::vector& args, + const std::vector& extra_args) { + (void)extra_args; + + vTensorPtr out = graph->get_tensor(args[0].refs[0]); + vTensorPtr mat1 = graph->get_tensor(args[1].refs[0]); + vTensorPtr mat2 = graph->get_tensor(args[1].refs[1]); + + const int out_cols = utils::val_at(-2, mat1->sizes()); + const int out_rows = utils::val_at(-2, mat2->sizes()); + + std::vector new_out_sizes(3); + if (mat1->sizes().size() == 2) { + new_out_sizes.resize(2); + new_out_sizes.at(0) = out_cols; + new_out_sizes.at(1) = out_rows; + } else { + new_out_sizes.at(0) = mat1->sizes().at(0); + new_out_sizes.at(1) = out_cols; + new_out_sizes.at(2) = out_rows; + } + + out->virtual_resize(new_out_sizes); +} + +void add_q_matmul_node( + ComputeGraph& 
graph, + const ValueRef mat1, + const ValueRef mat2_data, + const ValueRef group_size, + const ValueRef scales_and_zeros_data, + const ValueRef out) { + auto storage_type = graph.storage_type_of(out); + + ValueRef mat2 = + prepack_buffer_if_tensor_ref(graph, mat2_data, utils::kWidthPacked); + + ValueRef scales_and_zeros = + prepack_if_tensor_ref(graph, scales_and_zeros_data, utils::kWidthPacked); + + std::string kernel_name = "q_4w_linear"; + + add_dtype_suffix(kernel_name, graph.dtype_of(out)); + add_storage_type_suffix(kernel_name, storage_type); + + const uint32_t group_size_val = graph.extract_scalar(group_size); + + vkapi::ParamsBindList ubos({}); + if (storage_type == utils::kBuffer) { + ubos.append(graph.sizes_ubo(out)); + ubos.append(graph.strides_ubo(out)); + ubos.append(graph.sizes_ubo(mat1)); + ubos.append(graph.strides_ubo(mat1)); + ubos.append(graph.strides_ubo(mat2)); + ubos.append(graph.strides_ubo(scales_and_zeros)); + } else { + ubos.append(graph.sizes_ubo(out)); + ubos.append(graph.sizes_ubo(mat1)); + ubos.append(graph.strides_ubo(mat2)); + ubos.append(graph.strides_ubo(scales_and_zeros)); + } + + auto out_sizes = graph.sizes_of(out); + uint32_t N = utils::val_at(-1, out_sizes); + uint32_t M = utils::val_at(-2, out_sizes); + + utils::uvec3 global_wg_size = {N, M, 1}; + + utils::uvec3 local_wg_size = adaptive_work_group_size(global_wg_size); + + graph.execute_nodes().emplace_back(new ExecuteNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + global_wg_size, + local_wg_size, + // Inputs and Outputs + {{out, vkapi::MemoryAccessType::WRITE}, + {{mat1, mat2, scales_and_zeros}, vkapi::MemoryAccessType::READ}}, + // Shader params buffers + ubos, + // Specialization Constants + {SV(group_size_val)}, + // Resizing Logic + resize_q_matmul_node, + {})); +} + +void int4pack_mm(ComputeGraph& graph, const std::vector& args) { + check_q_matmul_args(graph, args[0], args[1], args[2], args[3], args[4]); + return add_q_matmul_node( + graph, + args[0], // mat1 + args[1], // mat2 + args[2], // group_size + args[3], // scales_and_zeros + args[4] // out + ); +} + +REGISTER_OPERATORS { + VK_REGISTER_OP(aten._weight_int4pack_mm.default, int4pack_mm); +} + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Repeat.cpp b/backends/vulkan/runtime/graph/ops/impl/Repeat.cpp index 3ef80dc49c7..741b65a84f0 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Repeat.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Repeat.cpp @@ -23,8 +23,8 @@ void check_args( const api::vTensor& in, const std::vector& repeats, const api::vTensor& out) { - VK_CHECK_COND(check_memory_layout_is(in, utils::kChannelsPacked)); - VK_CHECK_COND(check_memory_layout_is(out, utils::kChannelsPacked)); + VK_CHECK_COND(check_packed_dim_is(in, WHCN::kChannelsDim)); + VK_CHECK_COND(check_packed_dim_is(out, WHCN::kChannelsDim)); int64_t in_dim = in.dim(); VK_CHECK_COND( @@ -108,7 +108,7 @@ void add_repeat_channel_node( // Parameter buffers {graph.create_params_buffer(repeat_channel_args)}, // Specialization Constants - {SV(t_out->packed_dim_whcn_idx())})); + {SV(t_out->packed_dim())})); } void add_repeat_node( @@ -130,7 +130,7 @@ void add_repeat_node( // After expanding a dimension, we will update the "running_range" since we // will need to copy the "expanded" area. 
- utils::ivec3 running_range = t_in->texture_limits(); + utils::ivec3 running_range = t_in->logical_limits(); const std::vector& in_sizes = t_in->sizes(); diff --git a/backends/vulkan/runtime/graph/ops/impl/Select.cpp b/backends/vulkan/runtime/graph/ops/impl/Select.cpp index 351db0d192b..b2f2245f648 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Select.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Select.cpp @@ -22,8 +22,8 @@ void check_args( int64_t dim, int64_t index, const api::vTensor& t_out) { - VK_CHECK_COND(check_memory_layout_is(t_in, utils::kChannelsPacked)); - VK_CHECK_COND(check_memory_layout_is(t_out, utils::kChannelsPacked)); + VK_CHECK_COND(check_packed_dim_is(t_in, WHCN::kChannelsDim)); + VK_CHECK_COND(check_packed_dim_is(t_out, WHCN::kChannelsDim)); const int64_t in_dim = t_in.dim(); VK_CHECK_COND( @@ -112,7 +112,7 @@ void add_select_int_node( {{out, vkapi::MemoryAccessType::WRITE}, {in, vkapi::MemoryAccessType::READ}}, // Parameter buffers - {t_out->texture_limits_ubo(), + {t_out->logical_limits_ubo(), t_out->sizes_ubo(), // TODO: num_batches and num_texel_per_batch are provided by // t_out->sizes. Can change the following to reduce params diff --git a/backends/vulkan/runtime/graph/ops/impl/Slice.cpp b/backends/vulkan/runtime/graph/ops/impl/Slice.cpp index 8b323bafedd..21e6549513d 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Slice.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Slice.cpp @@ -10,6 +10,8 @@ #include +#include + #include #include #include @@ -42,8 +44,8 @@ void add_slice_tensor_out_node( vTensorPtr t_in = graph.get_tensor(in); vTensorPtr t_out = graph.get_tensor(out); - VK_CHECK_COND(check_memory_layout_is(*t_in, utils::kChannelsPacked)); - VK_CHECK_COND(check_memory_layout_is(*t_out, utils::kChannelsPacked)); + VK_CHECK_COND(check_packed_dim_is(*t_in, WHCN::kChannelsDim)); + VK_CHECK_COND(check_packed_dim_is(*t_out, WHCN::kChannelsDim)); // Need normalize the dim int64_t dim = graph.extract_scalar(dim_ref); @@ -123,7 +125,7 @@ void add_slice_tensor_out_node( kernel_name.reserve(kShaderNameReserve); add_dtype_suffix(kernel_name, *t_out); - utils::uvec3 global_size = t_out->image_extents(); + utils::uvec3 global_size = t_out->logical_limits(); utils::uvec3 local_size = adaptive_work_group_size(global_size); const struct Block final { @@ -149,6 +151,124 @@ void add_slice_tensor_out_node( } } +std::vector get_slice_sizes( + ComputeGraph& graph, + ValueRef in_ref, + ValueRef dim_ref, + ValueRef opt_start_ref, + ValueRef opt_end_ref) { + const int64_t dim = graph.extract_scalar(dim_ref); + std::optional opt_start = + graph.extract_optional_scalar(opt_start_ref); + std::optional opt_end = + graph.extract_optional_scalar(opt_end_ref); + + int64_t dim_size = graph.size_at(dim, in_ref); + int64_t start = opt_start.value_or(0); + int64_t end = opt_end.value_or(dim_size); + + start = normalize_idx(start, dim_size, 0); + end = normalize_idx(end, dim_size, dim_size); + + std::vector new_out_sizes = graph.sizes_of(in_ref); + new_out_sizes.at(dim) = end - start; + + return new_out_sizes; +} + +void resize_slice_view_node( + ComputeGraph* graph, + const std::vector& args, + const std::vector& extra_args) { + (void)args; + vTensorPtr out = graph->get_tensor(extra_args[0]); + + std::vector new_out_sizes = get_slice_sizes( + *graph, + extra_args[1], // input + extra_args[2], // dim + extra_args[3], // optional start + extra_args[4]); // optional end + + out->virtual_resize(new_out_sizes); +} + +void check_slice_view_args( + ComputeGraph& graph, + ValueRef 
in_ref, + ValueRef dim_ref, + ValueRef opt_start_ref, + ValueRef opt_end_ref, + ValueRef opt_step_ref, + ValueRef out_ref) { + VK_CHECK_COND( + graph.val_is_view_of(out_ref, in_ref), + "output must be a view of the input"); + + const int64_t dim = graph.extract_scalar(dim_ref); + const int64_t dim_size = graph.size_at(dim, in_ref); + + int64_t start = + graph.extract_optional_scalar(opt_start_ref).value_or(0); + int64_t end = graph.extract_optional_scalar(opt_end_ref).value_or(0); + int64_t step = + graph.extract_optional_scalar(opt_step_ref).value_or(1); + + start = normalize_idx(start, dim_size, 0); + end = normalize_idx(end, dim_size, dim_size); + + // The start idx must be 0; this is to ensure that the start of the slice view + // does not have any offset with respect to the base buffer storage. If the + // offset is nonzero, then it will potentially change upon a resize; however + // the buffer offset of the view tensor will have been "locked in" when the + // descriptor for its buffer storage is bound to a compute shader. Therefore + // there is no way to update the offset of the view once it has been bound. + VK_CHECK_COND(start == 0, "start must be 0 for slice view"); + VK_CHECK_COND(step == 1, "step must be 1 for slice view"); + + VK_CHECK_COND( + end < dim_size, "end must be less than dim size for slice view"); + + // We must also check that all earlier dims in the dim order have a size of 1. + // This ensures that the slice view encompasses a contiguous memory region of + // the source tensor's memory buffer. + std::vector in_sizes = graph.sizes_of(in_ref); + std::vector in_dim_order = graph.dim_order_of(in_ref); + for (int i = 0; i < in_dim_order.size(); ++i) { + if (in_dim_order[i] == dim) { + break; + } + VK_CHECK_COND(in_sizes[in_dim_order[i]] == 1); + } +} + +void add_slice_view_node( + ComputeGraph& graph, + ValueRef in_ref, + ValueRef dim_ref, + ValueRef opt_start_ref, + ValueRef opt_end_ref, + ValueRef opt_step_ref, + ValueRef out_ref) { + check_slice_view_args( + graph, + in_ref, + dim_ref, + opt_start_ref, + opt_end_ref, + opt_step_ref, + out_ref); + + std::vector new_out_sizes = + get_slice_sizes(graph, in_ref, dim_ref, opt_start_ref, opt_end_ref); + + graph.get_tensor(out_ref)->virtual_resize(new_out_sizes); + + graph.execute_nodes().emplace_back(new ExecuteNode( + resize_slice_view_node, + {out_ref, in_ref, dim_ref, opt_start_ref, opt_end_ref, opt_step_ref})); +} + void slice_tensor_out(ComputeGraph& graph, const std::vector& args) { return add_slice_tensor_out_node( graph, @@ -160,9 +280,36 @@ void slice_tensor_out(ComputeGraph& graph, const std::vector& args) { args[5]); } +void slice_tensor(ComputeGraph& graph, const std::vector& args) { + ValueRef in = args[0]; + ValueRef out = args[5]; + + // Special case if out is a view of in + if (graph.val_is_view_of(out, in)) { + add_slice_view_node( + graph, + in, + args[1], // dim + args[2], // optional start + args[3], // optional end + args[4], // step + out); + return; + } + + add_slice_tensor_out_node( + graph, + in, + args[1], // dim + args[2], // optional start + args[3], // optional end + args[4], // step + out); +} + REGISTER_OPERATORS { VK_REGISTER_OP(aten.slice_copy.Tensor, slice_tensor_out); - VK_REGISTER_OP(aten.slice.Tensor, slice_tensor_out); + VK_REGISTER_OP(aten.slice.Tensor, slice_tensor); } } // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Slice.h b/backends/vulkan/runtime/graph/ops/impl/Slice.h new file mode 100644 index 00000000000..220066ff1bb --- /dev/null +++ 
b/backends/vulkan/runtime/graph/ops/impl/Slice.h @@ -0,0 +1,28 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +#include + +#include + +namespace vkcompute { + +void add_slice_view_node( + ComputeGraph& graph, + ValueRef in_ref, + ValueRef dim_ref, + ValueRef opt_start_ref, + ValueRef opt_end_ref, + ValueRef opt_step_ref, + ValueRef out_ref); + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Softmax.cpp b/backends/vulkan/runtime/graph/ops/impl/Softmax.cpp index fa4d3df944f..dd2fb43e656 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Softmax.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Softmax.cpp @@ -64,7 +64,7 @@ void add_softmax_node( {{out, vkapi::MemoryAccessType::WRITE}, {in_arg, vkapi::MemoryAccessType::READ}}, // Shader params buffers - {t_out->texture_limits_ubo(), + {t_out->logical_limits_ubo(), t_in->sizes_ubo(), graph.create_params_buffer(utils::make_ivec2({in_dim, softmax_dim}))}, // Specialization Constants diff --git a/backends/vulkan/runtime/graph/ops/impl/Split.cpp b/backends/vulkan/runtime/graph/ops/impl/Split.cpp index e093ccf1b72..39039e51025 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Split.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Split.cpp @@ -25,7 +25,7 @@ void add_split_with_sizes_default_node( ValueRef out_list_ref) { vTensorPtr t_in = graph.get_tensor(in); - VK_CHECK_COND(check_memory_layout_is(*t_in, utils::kChannelsPacked)); + VK_CHECK_COND(check_packed_dim_is(*t_in, WHCN::kChannelsDim)); ValueListPtr out_list = graph.get_value_list(out_list_ref); @@ -38,7 +38,7 @@ void add_split_with_sizes_default_node( ValueRef out_ref = (*out_list)[split_idx]; vTensorPtr t_out = graph.get_tensor(out_ref); - VK_CHECK_COND(check_memory_layout_is(*t_out, utils::kChannelsPacked)); + VK_CHECK_COND(check_packed_dim_is(*t_out, WHCN::kChannelsDim)); VK_CHECK_COND(dim_at(*t_out, dim_index) == split_size); } @@ -50,7 +50,7 @@ void add_split_with_sizes_default_node( // Doesn't need to use split_size since we have already verified that the // output tensor's size matches with the split_size. 
vTensorPtr t_out = graph.get_tensor(out_ref); - utils::ivec3 range = t_out->texture_limits(); + utils::ivec3 range = t_out->logical_limits(); add_copy_offset_node(graph, in, range, src_offset, dst_offset, out_ref); src_offset[0] += range[0]; @@ -61,7 +61,7 @@ void add_split_with_sizes_default_node( for (ValueRef out_ref : *out_list) { vTensorPtr t_out = graph.get_tensor(out_ref); - utils::ivec3 range = t_out->texture_limits(); + utils::ivec3 range = t_out->logical_limits(); add_copy_offset_node(graph, in, range, src_offset, dst_offset, out_ref); src_offset[1] += range[1]; @@ -72,7 +72,7 @@ void add_split_with_sizes_default_node( for (ValueRef out_ref : *out_list) { vTensorPtr t_out = graph.get_tensor(out_ref); - utils::ivec3 range = t_out->texture_limits(); + utils::ivec3 range = t_out->logical_limits(); add_copy_offset_node(graph, in, range, src_offset, dst_offset, out_ref); src_offset[2] += range[2]; diff --git a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp index b02613c2083..ef6e8347df8 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp @@ -31,7 +31,7 @@ void add_staging_to_tensor_node( graph.strides_ubo(out_tensor), graph.numel_ubo(out_tensor)}); } else { - ubos.append(graph.sizes_ubo(out_tensor)); + ubos.append({graph.sizes_ubo(out_tensor), graph.axis_map_ubo(out_tensor)}); } graph.execute_nodes().emplace_back(new ExecuteNode( @@ -45,7 +45,7 @@ void add_staging_to_tensor_node( // Parameter Buffers ubos, // Specialization Constants - {SV(graph.packed_dim_whcn_idx_of(out_tensor))}, + {SV(graph.packed_dim_of(out_tensor))}, // Resizing Logic nullptr, {})); @@ -69,7 +69,7 @@ void add_tensor_to_staging_node( graph.strides_ubo(in_tensor), graph.numel_ubo(in_tensor)}); } else { - ubos.append(graph.sizes_ubo(in_tensor)); + ubos.append({graph.sizes_ubo(in_tensor), graph.axis_map_ubo(in_tensor)}); } // Normally, the image_to_nchw shader is structured so that each thread reads @@ -97,7 +97,7 @@ void add_tensor_to_staging_node( // Parameter Buffers ubos, // Specialization Constants - {SV(graph.packed_dim_whcn_idx_of(in_tensor))})); + {SV(graph.packed_dim_of(in_tensor))})); } ValueRef prepack( @@ -113,7 +113,7 @@ ValueRef prepack( if (graph.is_buffer_storage(v)) { ubos.append({graph.sizes_ubo(v), graph.strides_ubo(v), graph.numel_ubo(v)}); } else { - ubos.append(graph.sizes_ubo(v)); + ubos.append({graph.sizes_ubo(v), graph.axis_map_ubo(v)}); } graph.prepack_nodes().emplace_back(new PrepackNode( @@ -127,7 +127,34 @@ ValueRef prepack( // Parameter Buffers ubos, // Specialization Constants - {SV(graph.packed_dim_whcn_idx_of(v))})); + {SV(graph.packed_dim_of(v))})); + + return v; +} + +ValueRef prepack_buffer( + ComputeGraph& graph, + const ValueRef vref, + const utils::GPUMemoryLayout layout) { + ValueRef v = graph.add_tensor_like(vref, utils::kBuffer, layout); + + vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR("buffer_to_buffer"); + + vkapi::ParamsBindList ubos; + ubos.append({graph.numel_ubo(v)}); + + graph.prepack_nodes().emplace_back(new PrepackNode( + graph, + shader, + graph.create_global_wg_size(v), + graph.create_local_wg_size(v), + // Input and Outputs + vref, + v, + // Parameter Buffers + ubos, + // Specialization Constants + {})); return v; } @@ -143,6 +170,17 @@ ValueRef prepack_if_tensor_ref( } } +ValueRef prepack_buffer_if_tensor_ref( + ComputeGraph& graph, + const ValueRef v, + const utils::GPUMemoryLayout layout) { + if (graph.val_is_tref(v)) { + return 
prepack_buffer(graph, v, layout); + } else { + return v; + } +} + ValueRef prepack_if_tensor_ref(ComputeGraph& graph, const ValueRef v) { if (graph.val_is_tref(v)) { utils::GPUMemoryLayout layout = @@ -153,4 +191,14 @@ ValueRef prepack_if_tensor_ref(ComputeGraph& graph, const ValueRef v) { } } +ValueRef prepack_buffer_if_tensor_ref(ComputeGraph& graph, const ValueRef v) { + if (graph.val_is_tref(v)) { + utils::GPUMemoryLayout layout = + graph.suggested_memory_layout(graph.get_tref(v)->sizes); + return prepack_buffer(graph, v, layout); + } else { + return v; + } +} + } // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Staging.h b/backends/vulkan/runtime/graph/ops/impl/Staging.h index fc875de80dd..88a9630239a 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Staging.h +++ b/backends/vulkan/runtime/graph/ops/impl/Staging.h @@ -29,6 +29,13 @@ ValueRef prepack_if_tensor_ref( const ValueRef v, const utils::GPUMemoryLayout layout); +ValueRef prepack_buffer_if_tensor_ref( + ComputeGraph& graph, + const ValueRef v, + const utils::GPUMemoryLayout layout); + ValueRef prepack_if_tensor_ref(ComputeGraph& graph, const ValueRef v); +ValueRef prepack_buffer_if_tensor_ref(ComputeGraph& graph, const ValueRef v); + } // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Sum.cpp b/backends/vulkan/runtime/graph/ops/impl/Sum.cpp index b466f404ad1..c0ce9e4f2c4 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Sum.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Sum.cpp @@ -47,8 +47,8 @@ void resize_sum_node( } void check_sum_args(const api::vTensor& in, const api::vTensor& out) { - VK_CHECK_COND(check_memory_layout_is(in, utils::kChannelsPacked)); - VK_CHECK_COND(check_memory_layout_is(out, utils::kChannelsPacked)); + VK_CHECK_COND(check_packed_dim_is(in, WHCN::kChannelsDim)); + VK_CHECK_COND(check_packed_dim_is(out, WHCN::kChannelsDim)); } void add_sum_dim_node( @@ -85,7 +85,7 @@ void add_sum_dim_node( {{out, vkapi::MemoryAccessType::WRITE}, {arg, vkapi::MemoryAccessType::READ}}, // Shader params buffers - {t_out->texture_limits_ubo(), + {t_out->logical_limits_ubo(), graph.create_params_buffer(dim + 4 - in_dim), graph.create_params_buffer(dim_size), graph.create_params_buffer(int(ceil(channel / 4.0)))}, diff --git a/backends/vulkan/runtime/graph/ops/impl/Transpose.cpp b/backends/vulkan/runtime/graph/ops/impl/Transpose.cpp index faa99ec1a18..98a104b4b70 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Transpose.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Transpose.cpp @@ -10,45 +10,14 @@ #include -#include +#include + #include #include -#include - namespace vkcompute { -/* - * Transposing for sizes and strides is as simple as swapping the values at - * dim0 and dim1 in the sizes/strides vector. - */ -void swap_vector_inplace( - std::vector& vec, - const int64_t dim0, - const int64_t dim1) { - std::iter_swap(vec.begin() + dim0, vec.begin() + dim1); -} - -/* - * Transposing the dim order is a bit more unintuitive. dim0 and dim1 have - * swapped their "identities", so we need to swap the values of dim0 and dim1 - * wherever they appear in the dim order vector. Compare this to just swapping - * the elements at dim0 and dim1 in the strides or sizes vectors. 
- */ -void transpose_dim_order_inplace( - std::vector& dim_order, - const int64_t dim0, - const int64_t dim1) { - for (int i = 0; i < dim_order.size(); ++i) { - if (dim_order[i] == dim0) { - dim_order[i] = dim1; - } else if (dim_order[i] == dim1) { - dim_order[i] = dim0; - } - } -} - void resize_transpose_view_node( ComputeGraph* graph, const std::vector& args, @@ -61,12 +30,9 @@ void resize_transpose_view_node( const int64_t dim1 = graph->extract_scalar(extra_args[3]); std::vector new_sizes = in->sizes(); - std::vector new_dim_order = in->dim_order(); - - swap_vector_inplace(new_sizes, dim0, dim1); - transpose_dim_order_inplace(new_dim_order, dim0, dim1); - - out->virtual_reconfigure(new_sizes, new_dim_order); + // Transpose the resized input sizes + std::iter_swap(new_sizes.begin() + dim0, new_sizes.begin() + dim1); + out->virtual_resize(new_sizes); } void check_transpose_view_args( @@ -95,13 +61,8 @@ void add_transpose_view_node( const int64_t dim0 = graph.extract_scalar(dim0_ref); const int64_t dim1 = graph.extract_scalar(dim1_ref); - std::vector new_sizes = graph.sizes_of(input_ref); - std::vector new_dim_order = graph.dim_order_of(input_ref); - - swap_vector_inplace(new_sizes, dim0, dim1); - transpose_dim_order_inplace(new_dim_order, dim0, dim1); - - graph.get_tensor(out_ref)->virtual_reconfigure(new_sizes, new_dim_order); + check_transpose_view_args(graph, input_ref, dim0, dim1, out_ref); + graph.get_tensor(out_ref)->virtual_transpose(dim0, dim1); graph.execute_nodes().emplace_back(new ExecuteNode( resize_transpose_view_node, {out_ref, input_ref, dim0_ref, dim1_ref})); diff --git a/backends/vulkan/runtime/graph/ops/impl/Transpose.h b/backends/vulkan/runtime/graph/ops/impl/Transpose.h new file mode 100644 index 00000000000..a4fc4029222 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/Transpose.h @@ -0,0 +1,26 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include + +#include + +#include + +namespace vkcompute { + +void add_transpose_view_node( + ComputeGraph& graph, + ValueRef input_ref, + ValueRef dim0_ref, + ValueRef dim1_ref, + ValueRef out_ref); + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/UnaryOp.cpp b/backends/vulkan/runtime/graph/ops/impl/UnaryOp.cpp index 075c0bc923a..ea27183ead0 100644 --- a/backends/vulkan/runtime/graph/ops/impl/UnaryOp.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/UnaryOp.cpp @@ -46,7 +46,7 @@ void add_unary_op_node( if (graph.is_buffer_storage(out)) { ubos.append({graph.numel_ubo(out)}); } else { - ubos.append({graph.texture_limits_ubo(out)}); + ubos.append({graph.logical_limits_ubo(out)}); } ubos.append( {graph.create_params_buffer(min), graph.create_params_buffer(max)}); @@ -114,12 +114,6 @@ float get_val_or_inf(ComputeGraph& graph, const ValueRef& val, bool max) { "hardshrink"); \ } -#define DEFINE_HARDSWISH_FN(op_name) \ - void op_name(ComputeGraph& graph, const std::vector& args) { \ - return add_unary_op_node( \ - graph, args[0], kDummyFloat, kDummyFloat, args[1], #op_name); \ - } - void gelu(ComputeGraph& graph, const std::vector& args) { // args[1] is the `approximate` string // https://fburl.com/code/9omngmyo @@ -140,7 +134,8 @@ DEFINE_CLAMP_FN(clamp); DEFINE_CLAMP_FN(hardtanh); DEFINE_RELU_FN(relu); DEFINE_HARDSHRINK_FN(hardshrink); -DEFINE_HARDSWISH_FN(hardswish); +DEFINE_ACTIVATION_FN(hardswish); +DEFINE_ACTIVATION_FN(hardsigmoid); REGISTER_OPERATORS { VK_REGISTER_OP(aten.abs.default, abs); @@ -157,6 +152,7 @@ REGISTER_OPERATORS { VK_REGISTER_OP(aten.tanh.default, tanh); VK_REGISTER_OP(aten.hardshrink.default, hardshrink); VK_REGISTER_OP(aten.hardswish.default, hardswish); + VK_REGISTER_OP(aten.hardsigmoid.default, hardsigmoid); } } // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Upsample.cpp b/backends/vulkan/runtime/graph/ops/impl/Upsample.cpp index 9183f2aea80..f7fe5282e02 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Upsample.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Upsample.cpp @@ -66,7 +66,7 @@ void add_upsample_nearest2d_node( ValueRef arg_in = prepack_if_tensor_ref(graph, in); vTensorPtr t_in = graph.get_tensor(in); - utils::uvec3 input_sizes = t_in->image_extents(); + utils::uvec3 input_sizes = t_in->logical_limits(); utils::ivec2 input_size = { utils::safe_downcast(input_sizes[0]), @@ -105,7 +105,7 @@ void add_upsample_nearest2d_node( {{out, vkapi::MemoryAccessType::WRITE}, {arg_in, vkapi::MemoryAccessType::READ}}, // Shader params buffers - {t_out->texture_limits_ubo(), + {t_out->logical_limits_ubo(), graph.create_params_buffer(input_size), graph.create_params_buffer(rev_scales)}, // Specialization Constants diff --git a/backends/vulkan/runtime/graph/ops/impl/View.cpp b/backends/vulkan/runtime/graph/ops/impl/View.cpp index 507dbdcf8b1..4832c16ab99 100644 --- a/backends/vulkan/runtime/graph/ops/impl/View.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/View.cpp @@ -76,7 +76,7 @@ void add_view_node( // Parameter Buffers {t_out->sizes_ubo(), t_in->sizes_ubo()}, // Specialization Constants - {SV(t_in->packed_dim_whcn_idx()), SV(t_out->packed_dim_whcn_idx())}, + {SV(t_in->packed_dim()), SV(t_out->packed_dim())}, // Resizing Logic resize_view_node, {sizes})); diff --git a/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h b/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h index 45dfceb3f0d..4bd8e9b900b 100644 --- a/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h +++ 
b/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h @@ -32,7 +32,8 @@ constexpr DimIndex kChannel4D = DimIndex::DIM_3RD_LAST; constexpr DimIndex kBatch4D = DimIndex::DIM_4TH_LAST; inline DimIndex normalize_to_dim_index(const api::vTensor& v_in, int32_t dim) { - return static_cast(dim - v_in.dim()); + return dim < 0 ? static_cast(dim) + : static_cast(dim - v_in.dim()); } /* diff --git a/backends/vulkan/runtime/graph/ops/impl/utils/QPackUtils.cpp b/backends/vulkan/runtime/graph/ops/impl/utils/QPackUtils.cpp new file mode 100644 index 00000000000..4cf678a9dcb --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/utils/QPackUtils.cpp @@ -0,0 +1,72 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +namespace vkcompute { + +void pack4(const uint8_t* w_ptr, uint8_t* b_ptr, uint32_t N, uint32_t K) { + for (int32_t n = 0; n < N; n++) { + for (int32_t k2 = 0; k2 < K / 2; k2++) { + uint8_t src_val0 = w_ptr[n * K + k2 * 2]; + uint8_t src_val1 = w_ptr[n * K + k2 * 2 + 1]; + b_ptr[n * (K / 2) + k2] = (uint8_t(src_val1) << 4) | uint8_t(src_val0); + } + } +} + +std::vector int4mm_pack_weights( + const std::vector& W_sizes, + const uint8_t* w_ptr) { + const int32_t N = utils::val_at(-1, W_sizes); + const int32_t K = utils::val_at(-2, W_sizes); + + const auto numel = K * N; + std::vector w_ptr_T(numel); + std::vector b_ptr(utils::div_up(numel, 2)); + + // Transpose the weights + for (int32_t k = 0; k < K; k++) { + for (int32_t n = 0; n < N; n++) { + w_ptr_T[n * K + k] = w_ptr[k * N + n]; + } + } + + // Pack two int4s into each int8 + pack4(w_ptr_T.data(), b_ptr.data(), N, K); + + return b_ptr; +} + +std::vector int4mm_dequantize_weights( + const std::vector& W_sizes, + const uint8_t* w_ptr, + const uint32_t group_size, + const float* scales_and_zeros) { + const int64_t N = utils::val_at(-1, W_sizes); + const int64_t K = utils::val_at(-2, W_sizes); + + std::vector w_ptr_deq(K * N); + const int k_groups = K / group_size; + const int zeros_stride = k_groups * N; + + for (int k = 0; k < K; k++) { + for (int n = 0; n < N; n++) { + const int kb = k / group_size; + const int scale_idx = k_groups * n + kb; + const float scale = scales_and_zeros[scale_idx]; + const float zero = + scales_and_zeros[scale_idx + zeros_stride] - scale * 8.0; + w_ptr_deq[k * N + n] = w_ptr[k * N + n] * scale + zero; + } + } + + return w_ptr_deq; +} + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/utils/QPackUtils.h b/backends/vulkan/runtime/graph/ops/impl/utils/QPackUtils.h new file mode 100644 index 00000000000..4c4cf26d504 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/utils/QPackUtils.h @@ -0,0 +1,25 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include + +namespace vkcompute { + +std::vector int4mm_pack_weights( + const std::vector& W_sizes, + const uint8_t* w_ptr); + +std::vector int4mm_dequantize_weights( + const std::vector& W_sizes, + const uint8_t* w_ptr, + const uint32_t group_size, + const float* scales_and_zeros); + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.cpp b/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.cpp index 2737a86a1ab..9d010c794ec 100644 --- a/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.cpp @@ -45,28 +45,26 @@ bool check_same_sizes_at( return utils::val_at(d1, t1.sizes()) == utils::val_at(d2, t2.sizes()); } -bool check_memory_layout_is( - const api::vTensor& t, - utils::GPUMemoryLayout layout) { - return t.gpu_memory_layout() == layout; +bool check_packed_dim_is(const api::vTensor& t, const int32_t packed_dim) { + return t.packed_dim() == packed_dim; } bool check_same_ndim(const api::vTensor& t1, const api::vTensor& t2) { return t1.sizes().size() == t2.sizes().size(); } -bool check_same_memory_layout(const api::vTensor& t1, const api::vTensor& t2) { - return t1.gpu_memory_layout() == t2.gpu_memory_layout(); +bool check_same_packed_dim(const api::vTensor& t1, const api::vTensor& t2) { + return t1.packed_dim() == t2.packed_dim(); } -bool check_same_memory_layout( +bool check_same_packed_dim( const api::vTensor& t1, const api::vTensor& t2, const api::vTensor& t3) { - if (t1.gpu_memory_layout() != t2.gpu_memory_layout()) { + if (t1.packed_dim() != t2.packed_dim()) { return false; } - return (t1.gpu_memory_layout() == t3.gpu_memory_layout()); + return (t1.packed_dim() == t3.packed_dim()); } // @@ -78,13 +76,15 @@ bool is_packed_dim_broadcasted( const api::vTensor& rcvr) { // We assume that the tensors are broadcastable. If values aren't equal at // some index, then the value of rcvr is 1 and hence should be broadcasted. 
- switch (sndr.gpu_memory_layout()) { - case utils::kChannelsPacked: + switch (sndr.packed_dim()) { + case WHCN::kChannelsDim: return utils::val_at(-3, sndr.sizes()) > utils::val_at(-3, rcvr.sizes()); - case utils::kHeightPacked: + case WHCN::kHeightDim: return utils::val_at(-2, sndr.sizes()) > utils::val_at(-2, rcvr.sizes()); - case utils::kWidthPacked: + case WHCN::kWidthDim: return utils::val_at(-1, sndr.sizes()) > utils::val_at(-1, rcvr.sizes()); + default: + VK_THROW("Invalid packed dim"); } } diff --git a/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h b/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h index 44155a7ce62..754cc551d0e 100644 --- a/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h +++ b/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h @@ -34,13 +34,11 @@ bool check_same_sizes_at( const api::vTensor& t2, int64_t d2); -bool check_memory_layout_is( - const api::vTensor& t, - utils::GPUMemoryLayout layout); +bool check_packed_dim_is(const api::vTensor& t, const int32_t packed_dim); -bool check_same_memory_layout(const api::vTensor& t1, const api::vTensor& t2); +bool check_same_packed_dim(const api::vTensor& t1, const api::vTensor& t2); -bool check_same_memory_layout( +bool check_same_packed_dim( const api::vTensor& t1, const api::vTensor& t2, const api::vTensor& t3); diff --git a/backends/vulkan/runtime/graph/ops/utils/BindingUtils.cpp b/backends/vulkan/runtime/graph/ops/utils/BindingUtils.cpp index b0964ace225..2cfb34a052e 100644 --- a/backends/vulkan/runtime/graph/ops/utils/BindingUtils.cpp +++ b/backends/vulkan/runtime/graph/ops/utils/BindingUtils.cpp @@ -66,7 +66,7 @@ uint32_t bind_params_to_descriptor_set( } void bind_staging_to_descriptor_set( - api::StorageBuffer& staging, + api::StagingBuffer& staging, vkapi::DescriptorSet& descriptor_set, const uint32_t idx) { descriptor_set.bind(idx, staging.buffer()); diff --git a/backends/vulkan/runtime/graph/ops/utils/BindingUtils.h b/backends/vulkan/runtime/graph/ops/utils/BindingUtils.h index 3a7ec029da7..eed39a97979 100644 --- a/backends/vulkan/runtime/graph/ops/utils/BindingUtils.h +++ b/backends/vulkan/runtime/graph/ops/utils/BindingUtils.h @@ -40,7 +40,7 @@ uint32_t bind_params_to_descriptor_set( const uint32_t base_idx); void bind_staging_to_descriptor_set( - api::StorageBuffer& staging, + api::StagingBuffer& staging, vkapi::DescriptorSet& descriptor_set, const uint32_t idx); diff --git a/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.cpp b/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.cpp index 89f542de6fc..81d5c9e98af 100644 --- a/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.cpp +++ b/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.cpp @@ -69,28 +69,26 @@ void add_ndim_suffix(std::string& kernel_name, const api::vTensor& tensor) { } } -void add_memory_layout_suffix( - std::string& kernel_name, - utils::GPUMemoryLayout layout) { - switch (layout) { - case utils::kChannelsPacked: - kernel_name += "_C_packed"; +void add_packed_dim_suffix(std::string& kernel_name, const int32_t packed_dim) { + switch (packed_dim) { + case WHCN::kWidthDim: + kernel_name += "_W_packed"; break; - case utils::kHeightPacked: + case WHCN::kHeightDim: kernel_name += "_H_packed"; break; - case utils::kWidthPacked: - kernel_name += "_W_packed"; + case WHCN::kChannelsDim: + kernel_name += "_C_packed"; break; default: - break; + VK_THROW("Invalid packed dim!"); } } -void add_memory_layout_suffix( +void add_packed_dim_suffix( std::string& kernel_name, const api::vTensor& 
tensor) { - return add_memory_layout_suffix(kernel_name, tensor.gpu_memory_layout()); + return add_packed_dim_suffix(kernel_name, tensor.packed_dim()); } } // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h b/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h index e8f4f0d229e..10084054964 100644 --- a/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h +++ b/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h @@ -29,10 +29,8 @@ void add_dtype_suffix(std::string& kernel_name, const api::vTensor& tensor); void add_ndim_suffix(std::string& kernel_name, const size_t ndim); void add_ndim_suffix(std::string& kernel_name, const api::vTensor& tensor); -void add_memory_layout_suffix( - std::string& kernel_name, - const utils::GPUMemoryLayout layout); -void add_memory_layout_suffix( +void add_packed_dim_suffix(std::string& kernel_name, const int32_t packed_dim); +void add_packed_dim_suffix( std::string& kernel_name, const api::vTensor& tensor); diff --git a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp index 294e36b9a86..8804bcf2ef6 100644 --- a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp +++ b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp @@ -13,88 +13,8 @@ #include -#include - namespace vkcompute { -template -void memcpy_to_mapping_impl( - const void* src, - vkapi::MemoryMap& dst_mapping, - const size_t nbytes) { - T* data_ptr = dst_mapping.template data(); - memcpy(data_ptr, reinterpret_cast(src), nbytes); -} - -template -void memcpy_from_mapping_impl( - vkapi::MemoryMap& src_mapping, - void* dst, - const size_t nbytes) { - T* data_ptr = src_mapping.template data(); - memcpy(reinterpret_cast(dst), data_ptr, nbytes); -} - -void memcpy_to_mapping( - const void* src, - vkapi::MemoryMap& dst_mapping, - const size_t nbytes, - const vkapi::ScalarType dtype) { -#define DTYPE_CASE(ctype, vkformat, name) \ - case vkapi::ScalarType::name: \ - memcpy_to_mapping_impl(src, dst_mapping, nbytes); \ - break; - - switch (dtype) { - VK_FORALL_SCALAR_TYPES(DTYPE_CASE) - default: - VK_THROW("Unrecognized dtype!"); - } -#undef DTYPE_CASE -} - -void memcpy_from_mapping( - vkapi::MemoryMap& src_mapping, - void* dst, - const size_t nbytes, - const vkapi::ScalarType dtype) { -#define DTYPE_CASE(ctype, vkformat, name) \ - case vkapi::ScalarType::name: \ - memcpy_from_mapping_impl(src_mapping, dst, nbytes); \ - break; - - switch (dtype) { - VK_FORALL_SCALAR_TYPES(DTYPE_CASE) - default: - VK_THROW("Unrecognized dtype!"); - } -#undef DTYPE_CASE -} - -void copy_ptr_to_staging( - const void* src, - api::StorageBuffer& staging, - const size_t nbytes) { - vkapi::MemoryMap mapping(staging.buffer(), vkapi::MemoryAccessType::WRITE); - mapping.invalidate(); - memcpy_to_mapping(src, mapping, nbytes, staging.dtype()); -} - -void copy_staging_to_ptr( - api::StorageBuffer& staging, - void* dst, - const size_t nbytes) { - vkapi::MemoryMap mapping(staging.buffer(), vkapi::MemoryAccessType::READ); - mapping.invalidate(); - memcpy_from_mapping(mapping, dst, nbytes, staging.dtype()); -} - -void set_staging_zeros(api::StorageBuffer& staging, const size_t nbytes) { - vkapi::MemoryMap mapping(staging.buffer(), vkapi::MemoryAccessType::WRITE); - uint8_t* data_ptr = mapping.template data(); - memset(data_ptr, 0, staging.nbytes()); -} - vkapi::ShaderInfo get_nchw_to_tensor_shader( const api::vTensor& v_dst, const bool int8_buffer_enabled) { diff --git 
a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h index cabc17f30ee..8d63958a738 100644 --- a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h +++ b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h @@ -12,25 +12,6 @@ namespace vkcompute { -// -// Functions to copy data into and out of a staging buffer -// - -void copy_ptr_to_staging( - const void* src, - api::StorageBuffer& staging, - const size_t nbytes); -void copy_staging_to_ptr( - api::StorageBuffer& staging, - void* dst, - const size_t nbytes); - -void set_staging_zeros(api::StorageBuffer& staging, const size_t nbytes); - -// -// Functions to get shaders -// - vkapi::ShaderInfo get_nchw_to_tensor_shader( const api::vTensor& v_dst, bool int8_buffer_enabled = true); diff --git a/backends/vulkan/runtime/utils/StorageUtils.h b/backends/vulkan/runtime/utils/StorageUtils.h index 3cd60e25fd2..5ada8df8af7 100644 --- a/backends/vulkan/runtime/utils/StorageUtils.h +++ b/backends/vulkan/runtime/utils/StorageUtils.h @@ -8,7 +8,19 @@ #pragma once +#include + namespace vkcompute { + +// Convenience constexpr to attach semantic names to WHCN dimension index +namespace WHCN { + +constexpr int32_t kWidthDim = 0; +constexpr int32_t kHeightDim = 1; +constexpr int32_t kChannelsDim = 2; + +} // namespace WHCN + namespace utils { // @@ -36,20 +48,42 @@ static constexpr StorageType kTexture3D = StorageType::TEXTURE_3D; static constexpr StorageType kTexture2D = StorageType::TEXTURE_2D; /* - * The enum below is used to describe how tensor data is laid out when stored in - * GPU memory; specifically, it indicates how tensor data is packed along a - * texel (i.e. a vector of 4 scalar values). + * A tensor's memory layout is defined in one of two ways: + * + * 1. If it's a buffer backed tensor, the memory layout is defined by its + * `dim_order`, and by extension its `strides`. + * 2. If it's a texture backed tensor, the memory layout is defined by the + * combination of its `axis_map` and its `packed_dim`. * - * Each enum entry indicates which tensor dimension is packed along a texel, and - * it's value is set to the index of that dimension in WHCN dimension order. For - * instance, the width dimension corresponds to index 0, so the - * TENSOR_WIDTH_PACKED enum entry is set to 0. + * Providing explicit memory layout metadata upon tensor construction is not + * very convenient from an API perspective, so the `GPUMemoryLayout` serves as + * an abstraction that is used to determine how to initialize a tensor's layout + * metadata based on the developer's intent. A `GPUMemoryLayout` is provided to + * the constructor of `vTensor`, which will use it to determine how to set its + * `dim_order` if it's a buffer backed tensor, or how to set its `axis_map` and + * `packed_dim` if it's a texture backed tensor. * - * When interpreted as an integer, the enum value can be used as a dim index - * representing the packed dimension. This is used in shaders to resolve tensor - * indexing calculations. + * Note that GPUMemoryLayout is not stored as a tensor property, as it does not + * have any meaning after the vTensor is constructed. After construction, + * methods such as `virtual_transpose()` may be used to modify the tensor's + * layout metadata that cannot be represented by any `GPUMemoryLayout` entry. + * Nonetheless, a "best guess" of the closest memory layout can be produced via + * the `estimate_memory_layout()` API of `vTensor`. 
+ * + * Currently, only 3 memory layouts are provided, but more will be added in the + * future that will enable different functionality such as minimizing texture + * memory footprint. */ enum class GPUMemoryLayout : uint8_t { + /* + * The below memory layouts will produce a `vTensor` with the following + * properties: + * + * 1. For buffer backed tensors, the `dim_order` will be the same as a + * contiguous dim order, but with the specified dim last in the dim order. + * 2. For texture backed tensors, the packed dim will be the specified dim. + * The axis map will be `{0, 1, 2, 2}`. + */ TENSOR_WIDTH_PACKED = 0u, TENSOR_HEIGHT_PACKED = 1u, TENSOR_CHANNELS_PACKED = 2u, @@ -64,14 +98,35 @@ static constexpr GPUMemoryLayout kHeightPacked = static constexpr GPUMemoryLayout kChannelsPacked = GPUMemoryLayout::TENSOR_CHANNELS_PACKED; -/* - * Given a GPUMemoryLayout, return an offset that can be used to determine the - * index of the dimension that is packed along texels, assuming NCHW dimension - * order. The index of the packed dimension will be ndim - offset. - */ template -T to_packed_dim_nchw_offset(const GPUMemoryLayout layout) { - return static_cast(layout) + 1; +T to_packed_dim(const GPUMemoryLayout layout) { + switch (layout) { + case kWidthPacked: + return 0; + case kHeightPacked: + return 1; + case kChannelsPacked: + return 2; + }; + // Should be unreachable + return 0; +} + +inline std::ostream& operator<<( + std::ostream& os, + const GPUMemoryLayout layout) { + switch (layout) { + case kWidthPacked: + os << "TENSOR_WIDTH_PACKED"; + break; + case kHeightPacked: + os << "TENSOR_HEIGHT_PACKED"; + break; + case kChannelsPacked: + os << "TENSOR_CHANNELS_PACKED"; + break; + } + return os; } } // namespace utils diff --git a/backends/vulkan/runtime/utils/VecUtils.h b/backends/vulkan/runtime/utils/VecUtils.h index 55bb0f7d1b5..ad4434cf5af 100644 --- a/backends/vulkan/runtime/utils/VecUtils.h +++ b/backends/vulkan/runtime/utils/VecUtils.h @@ -238,6 +238,28 @@ struct vec final { // NOLINTNEXTLINE Type data[N]; + vec() = default; + + // Standard constructor with initializer list + vec(std::initializer_list values) { + VK_CHECK_COND(values.size() == N); + std::copy(values.begin(), values.end(), data); + } + + // Conversion constructor from an _integral_ vec type. Note that this is only + // defined if `OtherType` is an integral type to disallow implicit narrowing. 
+ template < + typename OtherType, + typename std::enable_if< + !std::is_same::value && + std::is_integral::value, + int>::type = 0> + /* implicit */ vec(const vec& other) { + for (int i = 0; i < N; ++i) { + data[i] = safe_downcast(other[i]); + } + } + const Type& operator[](const uint32_t& i) const { VK_CHECK_COND(i >= 0 && i < N, "Index out of bounds!"); return data[i]; diff --git a/backends/vulkan/runtime/vk_api/memory/Allocation.cpp b/backends/vulkan/runtime/vk_api/memory/Allocation.cpp index b07bb2862d3..908feb0d3fc 100644 --- a/backends/vulkan/runtime/vk_api/memory/Allocation.cpp +++ b/backends/vulkan/runtime/vk_api/memory/Allocation.cpp @@ -26,36 +26,24 @@ namespace vkcompute { namespace vkapi { Allocation::Allocation() - : memory_requirements{}, - create_info{}, - allocator(VK_NULL_HANDLE), - allocation(VK_NULL_HANDLE), - is_copy_(false) {} + : allocator(VK_NULL_HANDLE), allocation(VK_NULL_HANDLE), is_copy_(false) {} Allocation::Allocation( VmaAllocator vma_allocator, const VkMemoryRequirements& mem_props, const VmaAllocationCreateInfo& create_info) - : memory_requirements(mem_props), - create_info(create_info), - allocator(vma_allocator), - allocation(VK_NULL_HANDLE), - is_copy_(false) { + : allocator(vma_allocator), allocation(VK_NULL_HANDLE), is_copy_(false) { VK_CHECK(vmaAllocateMemory( - allocator, &memory_requirements, &create_info, &allocation, nullptr)); + allocator, &mem_props, &create_info, &allocation, nullptr)); } Allocation::Allocation(const Allocation& other) noexcept - : memory_requirements(other.memory_requirements), - create_info(other.create_info), - allocator(other.allocator), + : allocator(other.allocator), allocation(other.allocation), is_copy_(true) {} Allocation::Allocation(Allocation&& other) noexcept - : memory_requirements(other.memory_requirements), - create_info(other.create_info), - allocator(other.allocator), + : allocator(other.allocator), allocation(other.allocation), is_copy_(other.is_copy_) { other.allocation = VK_NULL_HANDLE; @@ -64,8 +52,6 @@ Allocation::Allocation(Allocation&& other) noexcept Allocation& Allocation::operator=(Allocation&& other) noexcept { VmaAllocation tmp_allocation = allocation; - memory_requirements = other.memory_requirements; - create_info = other.create_info; allocator = other.allocator; allocation = other.allocation; is_copy_ = other.is_copy_; diff --git a/backends/vulkan/runtime/vk_api/memory/Allocation.h b/backends/vulkan/runtime/vk_api/memory/Allocation.h index cec6f61e766..e56605e14b2 100644 --- a/backends/vulkan/runtime/vk_api/memory/Allocation.h +++ b/backends/vulkan/runtime/vk_api/memory/Allocation.h @@ -55,9 +55,6 @@ struct Allocation final { ~Allocation(); - VkMemoryRequirements memory_requirements; - // The properties this allocation was created with - VmaAllocationCreateInfo create_info; // The allocator object this was allocated from VmaAllocator allocator; // Handles to the allocated memory @@ -78,6 +75,7 @@ struct Allocation final { } friend class VulkanBuffer; + friend class VulkanImage; }; } // namespace vkapi diff --git a/backends/vulkan/runtime/vk_api/memory/Allocator.cpp b/backends/vulkan/runtime/vk_api/memory/Allocator.cpp index 1dadca27a0b..6533f061649 100644 --- a/backends/vulkan/runtime/vk_api/memory/Allocator.cpp +++ b/backends/vulkan/runtime/vk_api/memory/Allocator.cpp @@ -58,6 +58,13 @@ Allocator::~Allocator() { vmaDestroyAllocator(allocator_); } +VmaAllocationCreateInfo Allocator::gpuonly_resource_create_info() { + VmaAllocationCreateInfo alloc_create_info = {}; + alloc_create_info.flags = 
DEFAULT_ALLOCATION_STRATEGY; + alloc_create_info.usage = VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE; + return alloc_create_info; +} + Allocation Allocator::create_allocation( const VkMemoryRequirements& memory_requirements, const VmaAllocationCreateInfo& create_info) { @@ -103,9 +110,7 @@ VulkanImage Allocator::create_image( (VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT); } - VmaAllocationCreateInfo alloc_create_info = {}; - alloc_create_info.flags = DEFAULT_ALLOCATION_STRATEGY; - alloc_create_info.usage = VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE; + VmaAllocationCreateInfo alloc_create_info = gpuonly_resource_create_info(); const VulkanImage::ImageProperties image_props{ image_type, @@ -132,45 +137,34 @@ VulkanImage Allocator::create_image( allocate_memory); } -VulkanBuffer Allocator::create_storage_buffer( - const VkDeviceSize size, - const bool gpu_only, - const bool allocate_memory) { +VulkanBuffer Allocator::create_staging_buffer(const VkDeviceSize size) { const VkBufferUsageFlags buffer_usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; VmaAllocationCreateInfo alloc_create_info = {}; alloc_create_info.flags = DEFAULT_ALLOCATION_STRATEGY; alloc_create_info.usage = VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE; - // The create storage buffer will be accessed by both the CPU and GPU, so set - // the appropriate flags to indicate that the host device will be accessing + // Staging buffers are accessed by both the CPU and GPU, so set the + // appropriate flags to indicate that the host device will be accessing // the data from this buffer. - if (!gpu_only) { - // Deferred memory allocation should only be used for GPU only buffers. - VK_CHECK_COND( - allocate_memory, - "Only GPU-only buffers should use deferred memory allocation"); - - alloc_create_info.flags |= VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT; - alloc_create_info.usage = VMA_MEMORY_USAGE_AUTO_PREFER_HOST; - alloc_create_info.requiredFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT; - alloc_create_info.preferredFlags = VK_MEMORY_PROPERTY_HOST_COHERENT_BIT | - VK_MEMORY_PROPERTY_HOST_CACHED_BIT; - } + alloc_create_info.flags |= VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT | + VMA_ALLOCATION_CREATE_MAPPED_BIT; + alloc_create_info.usage = VMA_MEMORY_USAGE_AUTO_PREFER_HOST; + alloc_create_info.requiredFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT; + alloc_create_info.preferredFlags = + VK_MEMORY_PROPERTY_HOST_COHERENT_BIT | VK_MEMORY_PROPERTY_HOST_CACHED_BIT; - return VulkanBuffer( - allocator_, size, alloc_create_info, buffer_usage, allocate_memory); + return VulkanBuffer(allocator_, size, alloc_create_info, buffer_usage); } -VulkanBuffer Allocator::create_staging_buffer(const VkDeviceSize size) { - VmaAllocationCreateInfo alloc_create_info = {}; - alloc_create_info.flags = DEFAULT_ALLOCATION_STRATEGY; - alloc_create_info.usage = VMA_MEMORY_USAGE_AUTO_PREFER_HOST; - - VkBufferUsageFlags buffer_usage = - VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT; +VulkanBuffer Allocator::create_storage_buffer( + const VkDeviceSize size, + const bool allocate_memory) { + const VkBufferUsageFlags buffer_usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; - return VulkanBuffer(allocator_, size, alloc_create_info, buffer_usage); + VmaAllocationCreateInfo alloc_create_info = gpuonly_resource_create_info(); + return VulkanBuffer( + allocator_, size, alloc_create_info, buffer_usage, allocate_memory); } VulkanBuffer Allocator::create_uniform_buffer(const VkDeviceSize size) { @@ -181,9 +175,7 @@ VulkanBuffer Allocator::create_uniform_buffer(const 
VkDeviceSize size) { VkBufferUsageFlags buffer_usage = VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT; - VulkanBuffer uniform_buffer( - allocator_, size, alloc_create_info, buffer_usage); - return uniform_buffer; + return VulkanBuffer(allocator_, size, alloc_create_info, buffer_usage); } } // namespace vkapi diff --git a/backends/vulkan/runtime/vk_api/memory/Allocator.h b/backends/vulkan/runtime/vk_api/memory/Allocator.h index 904163cefb4..56385eb54d7 100644 --- a/backends/vulkan/runtime/vk_api/memory/Allocator.h +++ b/backends/vulkan/runtime/vk_api/memory/Allocator.h @@ -48,6 +48,8 @@ class Allocator final { VmaAllocator allocator_; public: + VmaAllocationCreateInfo gpuonly_resource_create_info(); + Allocation create_allocation( const VkMemoryRequirements& memory_requirements, const VmaAllocationCreateInfo& create_info); @@ -62,13 +64,12 @@ class Allocator final { const bool allow_transfer = false, const bool allocate_memory = true); + VulkanBuffer create_staging_buffer(const VkDeviceSize); + VulkanBuffer create_storage_buffer( const VkDeviceSize, - const bool gpu_only = true, const bool allocate_memory = true); - VulkanBuffer create_staging_buffer(const VkDeviceSize); - /* * Create a uniform buffer with a specified size */ diff --git a/backends/vulkan/runtime/vk_api/memory/Buffer.cpp b/backends/vulkan/runtime/vk_api/memory/Buffer.cpp index 366b45a5e41..2af3d9efe31 100644 --- a/backends/vulkan/runtime/vk_api/memory/Buffer.cpp +++ b/backends/vulkan/runtime/vk_api/memory/Buffer.cpp @@ -58,8 +58,6 @@ VulkanBuffer::VulkanBuffer( nullptr, // pQueueFamilyIndices }; - memory_.create_info = allocation_create_info; - if (allocate_memory) { VK_CHECK(vmaCreateBuffer( allocator_, @@ -83,7 +81,7 @@ VulkanBuffer::VulkanBuffer( : buffer_properties_(other.buffer_properties_), allocator_(other.allocator_), memory_(other.memory_), - owns_memory_(other.owns_memory_), + owns_memory_(false), is_copy_(true), handle_(other.handle_) { // TODO: set the offset and range appropriately @@ -137,6 +135,12 @@ VulkanBuffer::~VulkanBuffer() { } } +VmaAllocationInfo VulkanBuffer::allocation_info() const { + VmaAllocationInfo info; + vmaGetAllocationInfo(allocator_, memory_.allocation, &info); + return info; +} + VkMemoryRequirements VulkanBuffer::get_memory_requirements() const { VkMemoryRequirements memory_requirements; vkGetBufferMemoryRequirements(this->device(), handle_, &memory_requirements); diff --git a/backends/vulkan/runtime/vk_api/memory/Buffer.h b/backends/vulkan/runtime/vk_api/memory/Buffer.h index 9302048f861..6197a02d402 100644 --- a/backends/vulkan/runtime/vk_api/memory/Buffer.h +++ b/backends/vulkan/runtime/vk_api/memory/Buffer.h @@ -114,9 +114,7 @@ class VulkanBuffer final { return memory_.allocation; } - inline VmaAllocationCreateInfo allocation_create_info() const { - return VmaAllocationCreateInfo(memory_.create_info); - } + VmaAllocationInfo allocation_info() const; inline VkBuffer handle() const { return handle_; diff --git a/backends/vulkan/runtime/vk_api/memory/Image.cpp b/backends/vulkan/runtime/vk_api/memory/Image.cpp index 42352cfb7e7..5029d166166 100644 --- a/backends/vulkan/runtime/vk_api/memory/Image.cpp +++ b/backends/vulkan/runtime/vk_api/memory/Image.cpp @@ -98,6 +98,7 @@ VulkanImage::VulkanImage() allocator_(VK_NULL_HANDLE), memory_{}, owns_memory_(false), + is_copy_(false), handles_{ VK_NULL_HANDLE, VK_NULL_HANDLE, @@ -120,6 +121,7 @@ VulkanImage::VulkanImage( allocator_(vma_allocator), memory_{}, owns_memory_{allocate_memory}, + is_copy_(false), handles_{ VK_NULL_HANDLE, VK_NULL_HANDLE, @@ -157,8 
+159,6 @@ VulkanImage::VulkanImage( layout_, // initialLayout }; - memory_.create_info = allocation_create_info; - if (allocate_memory) { VK_CHECK(vmaCreateImage( allocator_, @@ -175,6 +175,17 @@ VulkanImage::VulkanImage( } } +VulkanImage::VulkanImage(const VulkanImage& other) noexcept + : image_properties_(other.image_properties_), + view_properties_(other.view_properties_), + sampler_properties_(other.sampler_properties_), + allocator_(other.allocator_), + memory_(other.memory_), + owns_memory_{false}, + is_copy_(true), + handles_(other.handles_), + layout_(other.layout_) {} + VulkanImage::VulkanImage(VulkanImage&& other) noexcept : image_properties_(other.image_properties_), view_properties_(other.view_properties_), @@ -182,6 +193,7 @@ VulkanImage::VulkanImage(VulkanImage&& other) noexcept allocator_(other.allocator_), memory_(std::move(other.memory_)), owns_memory_(other.owns_memory_), + is_copy_(other.is_copy_), handles_(other.handles_), layout_(other.layout_) { other.handles_.image = VK_NULL_HANDLE; @@ -201,6 +213,7 @@ VulkanImage& VulkanImage::operator=(VulkanImage&& other) noexcept { allocator_ = other.allocator_; memory_ = std::move(other.memory_); owns_memory_ = other.owns_memory_; + is_copy_ = other.is_copy_; handles_ = other.handles_; layout_ = other.layout_; @@ -212,6 +225,13 @@ VulkanImage& VulkanImage::operator=(VulkanImage&& other) noexcept { } VulkanImage::~VulkanImage() { + // Do not destroy any resources if this class instance is a copy of another + // class instance, since this means that this class instance does not have + // ownership of the underlying resource. + if (is_copy_) { + return; + } + if (VK_NULL_HANDLE != handles_.image_view) { vkDestroyImageView(this->device(), handles_.image_view, nullptr); } diff --git a/backends/vulkan/runtime/vk_api/memory/Image.h b/backends/vulkan/runtime/vk_api/memory/Image.h index 1e78f84a5c5..447e980595f 100644 --- a/backends/vulkan/runtime/vk_api/memory/Image.h +++ b/backends/vulkan/runtime/vk_api/memory/Image.h @@ -22,6 +22,12 @@ #include namespace vkcompute { + +// Forward declare vTensor classes such that they can be set as friend classes +namespace api { +class vTensorStorage; +} // namespace api + namespace vkapi { class ImageSampler final { @@ -96,7 +102,23 @@ class VulkanImage final { VkSampler, const bool allocate_memory = true); - VulkanImage(const VulkanImage&) = delete; + protected: + /* + * The Copy constructor allows for creation of a class instance that are + * "aliases" of another class instance. The resulting class instance will not + * have ownership of the underlying VkImage. + * + * This behaviour is analogous to creating a copy of a pointer, thus it is + * unsafe, as the original class instance may be destroyed before the copy. + * These constructors are therefore marked protected so that they may be used + * only in situations where the lifetime of the original class instance is + * guaranteed to exceed, or at least be the same as, the lifetime of the + * copied class instance. + */ + VulkanImage(const VulkanImage& other) noexcept; + + public: + // To discourage creating copies, the assignment operator is still deleted. 
VulkanImage& operator=(const VulkanImage&) = delete; VulkanImage(VulkanImage&&) noexcept; @@ -123,6 +145,9 @@ class VulkanImage final { Allocation memory_; // Indicates whether the underlying memory is owned by this resource bool owns_memory_; + // Indicates whether this VulkanImage was copied from another VulkanImage, + // thus it does not have ownership of the underlying VKBuffer + bool is_copy_; Handles handles_; // Layout VkImageLayout layout_; @@ -144,10 +169,6 @@ class VulkanImage final { return memory_.allocation; } - inline VmaAllocationCreateInfo allocation_create_info() const { - return VmaAllocationCreateInfo(memory_.create_info); - } - inline VkFormat format() const { return image_properties_.image_format; } @@ -193,10 +214,18 @@ class VulkanImage final { return owns_memory_; } + inline bool is_copy() const { + return is_copy_; + } + inline operator bool() const { return (handles_.image != VK_NULL_HANDLE); } + inline bool is_copy_of(const VulkanImage& other) const { + return (handles_.image == other.handles_.image) && is_copy_; + } + inline void bind_allocation(const Allocation& memory) { VK_CHECK_COND(!memory_, "Cannot bind an already bound allocation!"); VK_CHECK(vmaBindImageMemory(allocator_, memory.allocation, handles_.image)); @@ -207,6 +236,8 @@ class VulkanImage final { } VkMemoryRequirements get_memory_requirements() const; + + friend class api::vTensorStorage; }; struct ImageMemoryBarrier final { diff --git a/backends/vulkan/serialization/vulkan_graph_builder.py b/backends/vulkan/serialization/vulkan_graph_builder.py index da40f0a720b..20d09f1df5c 100644 --- a/backends/vulkan/serialization/vulkan_graph_builder.py +++ b/backends/vulkan/serialization/vulkan_graph_builder.py @@ -24,6 +24,9 @@ Node, NoneType, _ScalarType, TensorSpec, List[_ScalarType], List[Node], str ] +logger: logging.Logger = logging.getLogger("") +logger.setLevel(logging.INFO) + class VkGraphBuilder: def __init__( @@ -351,9 +354,9 @@ def build_graph(self) -> vk_graph_schema.VkGraph: self.process_node(node, call_node_debug_hdl) call_node_debug_hdl += 1 - logging.info("Operators included in this Vulkan partition: ") + logger.info("Operators included in this Vulkan partition: ") for op in self.seen_ops: - logging.info(f" {op.__name__}") + logger.info(f" {op.__name__}") return vk_graph_schema.VkGraph( version="0", diff --git a/backends/vulkan/test/glsl/scalar_add_texture.glsl b/backends/vulkan/test/glsl/scalar_add_texture.glsl new file mode 100644 index 00000000000..992907d0c25 --- /dev/null +++ b/backends/vulkan/test/glsl/scalar_add_texture.glsl @@ -0,0 +1,29 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#version 450 core + +#define PRECISION ${PRECISION} + +layout(std430) buffer; + +${layout_declare_tensor(0, "rw", "t_in", "float", "texture3d")} +${layout_declare_ubo(1, "ivec3", "extents")} +${layout_declare_ubo(2, "int", "scalar")} + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +void main() { + const ivec3 pos = ivec3(gl_GlobalInvocationID); + if (any(greaterThanEqual(pos, extents))) { + return; + } + + vec4 in_tex = imageLoad(t_in, pos); + imageStore(t_in, pos, imageLoad(t_in, pos) + float(scalar)); +} diff --git a/backends/vulkan/test/op_tests/cases.py b/backends/vulkan/test/op_tests/cases.py index c5088ffdb32..9db5cc8a841 100644 --- a/backends/vulkan/test/op_tests/cases.py +++ b/backends/vulkan/test/op_tests/cases.py @@ -8,7 +8,7 @@ from collections import namedtuple from typing import Callable -from executorch.backends.vulkan.test.op_tests.utils.codegen import VkTestSuite +from executorch.backends.vulkan.test.op_tests.utils.test_suite import VkTestSuite # Prime numbers dim sizes for testing @@ -49,6 +49,7 @@ def get_binary_elementwise_inputs(): ((S, S1, S2), (S, S1, S2)), ((S, S1, S2), (S, S1, 1), 2.0), ((S, S1, S2), (S, 1, S2), 2.0), + ((XS, S, S1, S2), (XS, S, 1, 1), 2.0), ] ) test_suite.layouts = [ @@ -465,8 +466,8 @@ def get_view_inputs(): return test_suite -@register_test_suite(["aten.slice.Tensor", "aten.slice_copy.Tensor"]) -def get_slice_inputs(): +@register_test_suite("aten.slice_copy.Tensor") +def get_slice_out_inputs(): Test = namedtuple("VkSliceTest", ["self", "dim", "start", "end", "step"]) Test.__new__.__defaults__ = (None, 0, None, None, 1) @@ -548,6 +549,39 @@ def get_slice_inputs(): return test_suite +def get_slice_view_inputs(): + Test = namedtuple("VkSliceTest", ["self", "dim", "start", "end", "step"]) + Test.__new__.__defaults__ = (None, 0, None, None, 1) + + # Slice by channel + test_cases = [ + Test(self=[1, 17, 1, 10], dim=1, start=0, end=4), + Test(self=[1, 17, 1, 10], dim=1, start=0, end=8), + Test(self=[1, 17, 3, 7], dim=1, start=0, end=12), + ] + + test_suite = VkTestSuite([tuple(tc) for tc in test_cases]) + + test_suite.dtypes = ["at::kFloat"] + test_suite.storage_types = ["utils::kBuffer", "utils::kTexture3D"] + test_suite.layouts = ["utils::kWidthPacked"] + test_suite.data_gen = "make_seq_tensor" + test_suite.is_view_op = True + + return test_suite + + +@register_test_suite(["aten.slice.Tensor"]) +def get_slice_inputs(): + texture_test_suite = get_slice_out_inputs() + texture_test_suite.test_name_suffix = "no_view" + + view_test_suite = get_slice_view_inputs() + view_test_suite.test_name_suffix = "view" + + return [view_test_suite, texture_test_suite] + + @register_test_suite(["aten.transpose.int"]) def get_transpose_inputs(): Test = namedtuple("VkTransposeViewTest", ["self", "dim0", "dim1"]) @@ -558,7 +592,6 @@ def get_transpose_inputs(): Test(self=[M1, S2, M], dim0=0, dim1=1), Test(self=[M1, S2, M], dim0=0, dim1=2), Test(self=[M1, S2, M], dim0=2, dim1=1), - Test(self=[S, M, S2, M2], dim0=0, dim1=2), Test(self=[S, M, S2, M2], dim0=3, dim1=2), Test(self=[S, M, S2, M2], dim0=1, dim1=2), Test(self=[S, M, S2, M2], dim0=3, dim1=1), @@ -567,7 +600,7 @@ def get_transpose_inputs(): test_suite = VkTestSuite([tuple(tc) for tc in test_cases]) test_suite.dtypes = ["at::kFloat"] - test_suite.storage_types = ["utils::kBuffer"] + test_suite.storage_types = ["utils::kBuffer", "utils::kTexture3D"] test_suite.layouts = ["utils::kWidthPacked", "utils::kChannelsPacked"] test_suite.data_gen = "make_seq_tensor" test_suite.is_view_op = 
True @@ -904,6 +937,7 @@ def get_softmax_inputs(): "aten.neg.default", "aten.cos.default", "aten.hardswish.default", + "aten.hardsigmoid.default", ] ) def get_unary_ops_inputs(): diff --git a/backends/vulkan/test/op_tests/generate_op_benchmarks.py b/backends/vulkan/test/op_tests/generate_op_benchmarks.py new file mode 100644 index 00000000000..7f286123df9 --- /dev/null +++ b/backends/vulkan/test/op_tests/generate_op_benchmarks.py @@ -0,0 +1,88 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +import os + +from typing import Dict + +from executorch.backends.vulkan.test.op_tests.cases import test_suites + +from executorch.backends.vulkan.test.op_tests.utils.gen_benchmark_vk import ( + VkBenchmarkFileGen, +) +from executorch.backends.vulkan.test.op_tests.utils.gen_computegraph import ( + ComputeGraphGen, +) +from executorch.backends.vulkan.test.op_tests.utils.test_suite import TestSuite +from torchgen import local + +from torchgen.gen import parse_native_yaml, ParsedYaml +from torchgen.model import DispatchKey, NativeFunction + + +def registry_name(f: NativeFunction) -> str: + name = str(f.namespace) + "." + str(f.func.name) + if len(f.func.name.overload_name) == 0: + name += ".default" + return name + + +def construct_f_map(parsed_yaml: ParsedYaml) -> Dict[str, NativeFunction]: + f_map: Dict[str, NativeFunction] = {} + for f in parsed_yaml.native_functions: + f_map[registry_name(f)] = f + return f_map + + +def process_test_suites( + cpp_generator: VkBenchmarkFileGen, + f_map: Dict[str, NativeFunction], + test_suites: Dict[str, TestSuite], +) -> None: + for registry_name, op_test_suites in test_suites.items(): + f = f_map[registry_name] + if isinstance(op_test_suites, list): + for suite in op_test_suites: + cpp_generator.add_suite(registry_name, f, suite) + else: + cpp_generator.add_suite(registry_name, f, op_test_suites) + + +@local.parametrize( + use_const_ref_for_mutable_tensors=False, use_ilistref_for_tensor_lists=False +) +def generate_cpp( + native_functions_yaml_path: str, tags_path: str, output_dir: str +) -> None: + output_file = os.path.join(output_dir, "op_benchmarks.cpp") + cpp_generator = VkBenchmarkFileGen(output_file) + + parsed_yaml = parse_native_yaml(native_functions_yaml_path, tags_path) + f_map = construct_f_map(parsed_yaml) + + ComputeGraphGen.backend_key = parsed_yaml.backend_indices[DispatchKey.CPU] + + process_test_suites(cpp_generator, f_map, test_suites) + + with open(output_file, "w") as file: + file.write(cpp_generator.generate_cpp()) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--aten-yaml-path", + help="path to native_functions.yaml file.", + ) + parser.add_argument( + "--tags-path", + help="Path to tags.yaml. 
Required by yaml parsing in gen_correctness_vk system.", + ) + + parser.add_argument("-o", "--output", help="Output directory", required=True) + args = parser.parse_args() + generate_cpp(args.aten_yaml_path, args.tags_path, args.output) diff --git a/backends/vulkan/test/op_tests/generate_op_tests.py b/backends/vulkan/test/op_tests/generate_op_correctness_tests.py similarity index 68% rename from backends/vulkan/test/op_tests/generate_op_tests.py rename to backends/vulkan/test/op_tests/generate_op_correctness_tests.py index 71047ac6f49..4e51e23940b 100644 --- a/backends/vulkan/test/op_tests/generate_op_tests.py +++ b/backends/vulkan/test/op_tests/generate_op_correctness_tests.py @@ -10,12 +10,14 @@ from typing import Dict from executorch.backends.vulkan.test.op_tests.cases import test_suites +from executorch.backends.vulkan.test.op_tests.utils.gen_computegraph import ( + ComputeGraphGen, +) -from executorch.backends.vulkan.test.op_tests.utils.codegen import VkCppTestFileGen -from executorch.backends.vulkan.test.op_tests.utils.codegen_base import ( - TestSuite, - TestSuiteGen, +from executorch.backends.vulkan.test.op_tests.utils.gen_correctness_vk import ( + VkCorrectnessTestFileGen, ) +from executorch.backends.vulkan.test.op_tests.utils.test_suite import TestSuite from torchgen import local from torchgen.gen import parse_native_yaml, ParsedYaml @@ -37,13 +39,17 @@ def construct_f_map(parsed_yaml: ParsedYaml) -> Dict[str, NativeFunction]: def process_test_suites( - cpp_generator: VkCppTestFileGen, + cpp_generator: VkCorrectnessTestFileGen, f_map: Dict[str, NativeFunction], test_suites: Dict[str, TestSuite], ) -> None: - for registry_name, op_test_suite in test_suites.items(): + for registry_name, op_test_suites in test_suites.items(): f = f_map[registry_name] - cpp_generator.add_suite(registry_name, f, op_test_suite) + if isinstance(op_test_suites, list): + for suite in op_test_suites: + cpp_generator.add_suite(registry_name, f, suite) + else: + cpp_generator.add_suite(registry_name, f, op_test_suites) @local.parametrize( @@ -53,12 +59,12 @@ def generate_cpp( native_functions_yaml_path: str, tags_path: str, output_dir: str ) -> None: output_file = os.path.join(output_dir, "op_tests.cpp") - cpp_generator = VkCppTestFileGen(output_file) + cpp_generator = VkCorrectnessTestFileGen(output_file) parsed_yaml = parse_native_yaml(native_functions_yaml_path, tags_path) f_map = construct_f_map(parsed_yaml) - TestSuiteGen.backend_key = parsed_yaml.backend_indices[DispatchKey.CPU] + ComputeGraphGen.backend_key = parsed_yaml.backend_indices[DispatchKey.CPU] process_test_suites(cpp_generator, f_map, test_suites) @@ -67,16 +73,14 @@ def generate_cpp( if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Generate a simple Hello World C++ program." - ) + parser = argparse.ArgumentParser() parser.add_argument( "--aten-yaml-path", help="path to native_functions.yaml file.", ) parser.add_argument( "--tags-path", - help="Path to tags.yaml. Required by yaml parsing in codegen system.", + help="Path to tags.yaml. 
Required by yaml parsing in gen_correctness_vk system.", ) parser.add_argument("-o", "--output", help="Output directory", required=True) args = parser.parse_args() diff --git a/backends/vulkan/test/op_tests/targets.bzl b/backends/vulkan/test/op_tests/targets.bzl index 0cffb5d80be..9b6ea61de21 100644 --- a/backends/vulkan/test/op_tests/targets.bzl +++ b/backends/vulkan/test/op_tests/targets.bzl @@ -8,9 +8,22 @@ def define_common_targets(is_fbcode = False): return runtime.python_library( - name = "generate_op_tests_lib", + name = "generate_op_correctness_tests_lib", srcs = native.glob(["utils/*.py"]) + [ - "generate_op_tests.py", + "generate_op_correctness_tests.py", + "cases.py", + ], + base_module = "executorch.backends.vulkan.test.op_tests", + deps = [ + "fbsource//third-party/pypi/expecttest:expecttest", + ], + external_deps = ["torchgen"], + ) + + runtime.python_library( + name = "generate_op_benchmarks_lib", + srcs = native.glob(["utils/*.py"]) + [ + "generate_op_benchmarks.py", "cases.py", ], base_module = "executorch.backends.vulkan.test.op_tests", @@ -21,23 +34,31 @@ def define_common_targets(is_fbcode = False): ) runtime.python_binary( - name = "generate_op_tests", - main_module = "executorch.backends.vulkan.test.op_tests.generate_op_tests", + name = "generate_op_correctness_tests", + main_module = "executorch.backends.vulkan.test.op_tests.generate_op_correctness_tests", deps = [ - ":generate_op_tests_lib", + ":generate_op_correctness_tests_lib", + ], + ) + + runtime.python_binary( + name = "generate_op_benchmarks", + main_module = "executorch.backends.vulkan.test.op_tests.generate_op_benchmarks", + deps = [ + ":generate_op_benchmarks_lib", ], ) aten_src_path = runtime.external_dep_location("aten-src-path") genrule_cmd = [ - "$(exe :generate_op_tests)", + "$(exe :generate_op_correctness_tests)", "--tags-path $(location {})/aten/src/ATen/native/tags.yaml".format(aten_src_path), "--aten-yaml-path $(location {})/aten/src/ATen/native/native_functions.yaml".format(aten_src_path), "-o $OUT", ] runtime.genrule( - name = "generated_op_tests_cpp", + name = "generated_op_correctness_tests_cpp", outs = { "op_tests.cpp": ["op_tests.cpp"], }, @@ -45,6 +66,22 @@ def define_common_targets(is_fbcode = False): default_outs = ["."], ) + benchmarks_genrule_cmd = [ + "$(exe :generate_op_benchmarks)", + "--tags-path $(location {})/aten/src/ATen/native/tags.yaml".format(aten_src_path), + "--aten-yaml-path $(location {})/aten/src/ATen/native/native_functions.yaml".format(aten_src_path), + "-o $OUT", + ] + + runtime.genrule( + name = "generated_op_benchmarks_cpp", + outs = { + "op_benchmarks.cpp": ["op_benchmarks.cpp"], + }, + cmd = " ".join(benchmarks_genrule_cmd), + default_outs = ["."], + ) + pt_operator_library( name = "all_aten_ops", check_decl = False, @@ -66,7 +103,7 @@ def define_common_targets(is_fbcode = False): runtime.cxx_binary( name = "compute_graph_op_tests_bin", srcs = [ - ":generated_op_tests_cpp[op_tests.cpp]", + ":generated_op_correctness_tests_cpp[op_tests.cpp]", ], define_static_target = False, deps = [ @@ -76,10 +113,26 @@ def define_common_targets(is_fbcode = False): ], ) + runtime.cxx_binary( + name = "compute_graph_op_benchmarks_bin", + srcs = [ + ":generated_op_benchmarks_cpp[op_benchmarks.cpp]", + ], + compiler_flags = [ + "-Wno-unused-variable", + ], + define_static_target = False, + deps = [ + "//third-party/benchmark:benchmark", + "//executorch/backends/vulkan:vulkan_graph_runtime", + ":all_aten_ops_lib", + ], + ) + runtime.cxx_test( name = "compute_graph_op_tests", srcs = [ 
- ":generated_op_tests_cpp[op_tests.cpp]", + ":generated_op_correctness_tests_cpp[op_tests.cpp]", ], contacts = ["oncall+ai_infra_mobile_platform@xmail.facebook.com"], fbandroid_additional_loaded_sonames = [ diff --git a/backends/vulkan/test/op_tests/utils/aten_types.py b/backends/vulkan/test/op_tests/utils/aten_types.py new file mode 100644 index 00000000000..186f5afb78b --- /dev/null +++ b/backends/vulkan/test/op_tests/utils/aten_types.py @@ -0,0 +1,30 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +#################### +## ATen C++ Types ## +#################### + +AT_INT_ARRAY_REF = "at::IntArrayRef" +AT_SCALAR = "at::Scalar" +AT_TENSOR = "at::Tensor" +AT_TENSOR_LIST = "at::TensorList" +BOOL = "bool" +DOUBLE = "double" +INT = "int64_t" +OPT_AT_DOUBLE_ARRAY_REF = "::std::optional>" +OPT_AT_INT_ARRAY_REF = "at::OptionalIntArrayRef" +OPT_AT_TENSOR = "::std::optional" +OPT_BOOL = "::std::optional" +OPT_INT64 = "::std::optional" +OPT_DEVICE = "::std::optional" +OPT_LAYOUT = "::std::optional" +OPT_MEMORY_FORMAT = "::std::optional" +OPT_SCALAR_TYPE = "::std::optional" +STRING = "c10::string_view" +TWO_TENSOR_TUPLE = "::std::tuple" +THREE_TENSOR_TUPLE = "::std::tuple" +TENSOR_VECTOR = "::std::vector" diff --git a/backends/vulkan/test/op_tests/utils/gen_benchmark_vk.py b/backends/vulkan/test/op_tests/utils/gen_benchmark_vk.py new file mode 100644 index 00000000000..fb42d982f67 --- /dev/null +++ b/backends/vulkan/test/op_tests/utils/gen_benchmark_vk.py @@ -0,0 +1,335 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import re + +from executorch.backends.vulkan.test.op_tests.utils.gen_computegraph import ( + ComputeGraphGen, +) +from executorch.backends.vulkan.test.op_tests.utils.gen_correctness_base import ( + CorrectnessTestGen, +) +from executorch.backends.vulkan.test.op_tests.utils.test_suite import TestSuite + +from torchgen.model import NativeFunction + +########################## +## Test Suite Generation ## +########################## + +benchmark_fixture_template = """ +class GeneratedOpBenchmark_{op_name} : public ::benchmark::Fixture {{ + protected: + ComputeGraph* graph; + at::ScalarType test_dtype = at::kFloat; + float rtol = {rtol}; + float atol = {atol}; + + {arg_valuerefs} + + void SetUp(::benchmark::State& state) override {{ + GraphConfig config; + config.descriptor_pool_safety_factor = 2.0; + test_dtype = at::ScalarType(state.range(0)); + const utils::StorageType storage_type = utils::StorageType(state.range(1)); + const utils::GPUMemoryLayout memory_layout = utils::GPUMemoryLayout(state.range(2)); + config.set_storage_type_override(storage_type); + config.set_memory_layout_override(memory_layout); + config.enable_querypool = true; + graph = new ComputeGraph(config); + }} + + void TearDown(::benchmark::State& state) override {{ + delete graph; + graph = nullptr; + }} + + {build_graph_fn} + {benchmark_fn} +}}; +""" + +benchmark_template = """ +BENCHMARK_DEFINE_F(GeneratedOpBenchmark_{op_name}, {case_name})(benchmark::State& state) {{ + {skips} + {create_ref_data} + {call_build_graph} + ShaderTimes shader_times; + for (auto _ : state) {{ + {call_benchmark} + graph->context()->querypool().extract_results(); + QueryPoolResults results = graph->context()->querypool().get_shader_timestamp_data(); + process_querypool_results(results, shader_times); + }} + register_shader_time_counters(state, shader_times); +}} + +BENCHMARK_REGISTER_F(GeneratedOpBenchmark_{op_name}, {case_name})->Threads(1)->ArgsProduct({combos}); +""" + + +class VkBenchmarkGen(CorrectnessTestGen): + def __init__(self, op_reg_name: str, f: NativeFunction, inputs: TestSuite): + super().__init__(f, inputs) + self.op_reg_name = op_reg_name + self.generator = ComputeGraphGen(self.op_reg_name, self.f, self.suite_def) + + def gen_call_benchmark(self, prepack=False) -> str: + test_str = f"benchmark_{self.op_name}(" + if prepack: + test_str = f"prepacked_benchmark_{self.op_name}(" + for binding in self.f_sig.arguments(): + arg = binding.argument + test_str += f"{arg.name}, " + test_str = test_str[:-2] + ");" + test_str = re.sub(r"^", " ", test_str, flags=re.M) + return test_str + + def gen_call_build_graph(self, prepack=False) -> str: + test_str = f"build_graph_{self.op_name}(" + if prepack: + test_str = f"prepacked_build_graph_{self.op_name}(" + for binding in self.f_sig.arguments(): + arg = binding.argument + test_str += f"{arg.name}, " + test_str = test_str[:-2] + ");" + test_str = re.sub(r"^", " ", test_str, flags=re.M) + return test_str + + def gen_combos(self, inputs) -> str: + dtypes_list = ", ".join(f"int({dtype})" for dtype in self.suite_def.dtypes) + storage_types_list = ", ".join( + f"int({storage_type})" for storage_type in self.suite_def.storage_types + ) + layouts_list = ", ".join(f"int({layout})" for layout in self.suite_def.layouts) + return f"{{ {{ {dtypes_list} }}, {{ {storage_types_list} }}, {{ {layouts_list} }} }}" + + def generate_benchmark_case(self, inputs, prepack=False) -> str: + return benchmark_template.format( + op_name=f"{self.op_name}", + case_name=self.gen_case_name(inputs, prepack), + 
skips=self.generator.gen_conditional_skips( + 'state.SkipWithError("unsupported type"); return;' + ), + create_ref_data=self.gen_create_ref_data(inputs), + call_build_graph=self.gen_call_build_graph(prepack), + call_benchmark=self.gen_call_benchmark(prepack), + combos=self.gen_combos(inputs), + ) + + def generate_benchmark(self) -> str: + benchmarks_cpp = "" + for inputs in self.suite_def.input_cases: + if not self.suite_def.requires_prepack: + benchmarks_cpp += self.generate_benchmark_case(inputs) + if self.suite_def.supports_prepack(): + benchmarks_cpp += self.generate_benchmark_case(inputs, prepack=True) + return benchmarks_cpp + + def generate_benchmark_fixture(self) -> str: + build_graph_fn = "" + benchmark_fn = "" + if not self.suite_def.requires_prepack: + build_graph_fn = self.generator.gen_build_graph_fn() + benchmark_fn = self.generator.gen_op_exec_graph_fn() + + prepacked_build_graph_fn = "" + prepacked_benchmark_fn = "" + if self.suite_def.supports_prepack(): + self.generator.should_prepack = True + prepacked_build_graph_fn = self.generator.gen_build_graph_fn() + build_graph_fn += "\n\n " + build_graph_fn += prepacked_build_graph_fn + prepacked_benchmark_fn = self.generator.gen_op_exec_graph_fn() + benchmark_fn += "\n\n " + benchmark_fn += prepacked_benchmark_fn + + return benchmark_fixture_template.format( + op_name=self.op_name, + build_graph_fn=build_graph_fn, + benchmark_fn=benchmark_fn, + rtol=self.suite_def.rtol, + arg_valuerefs=self.generator.gen_arg_valueref_decls(), + atol=self.suite_def.atol, + ) + + +########################## +## Test File Generation ## +########################## + +cpp_test_template = """ +#include +#include +#include + +#include +#include +#include + +using namespace vkcompute; +using TensorOptions = at::TensorOptions; + +vkapi::ScalarType from_at_scalartype(c10::ScalarType at_scalartype) {{ + switch (at_scalartype) {{ + case c10::kFloat: + return vkapi::kFloat; + case c10::kHalf: + return vkapi::kHalf; + case c10::kInt: + return vkapi::kInt; + case c10::kLong: + return vkapi::kInt; + case c10::kChar: + return vkapi::kChar; + default: + VK_THROW("Unsupported at::ScalarType!"); + }} +}} + +at::Tensor make_rand_tensor( + std::vector sizes, + at::ScalarType dtype = at::kFloat, + float low = 0.0, + float high = 1.0) {{ + if (high == 1.0 && low == 0.0) + return at::rand(sizes, at::device(at::kCPU).dtype(dtype)); + + if (dtype == at::kChar) + return at::randint(high, sizes, at::device(at::kCPU).dtype(dtype)); + + return at::rand(sizes, at::device(at::kCPU).dtype(dtype)) * (high - low) + low; +}} + +at::Tensor make_seq_tensor( + std::vector sizes, + at::ScalarType dtype = at::kFloat, + float low = 0.0, + float high = 1.0) {{ + (void)low; + (void)high; + + int64_t n = 1; + for (auto size: sizes) {{ + n *= size; + }} + + std::vector values(n); + for (int i=0;i indices) {{ + at::ScalarType dtype = at::kInt; + std::vector sizes = {{static_cast(indices.size())}}; + + // Clone as original data will be deallocated upon return. + return at::from_blob(indices.data(), sizes, dtype).detach().clone(); +}} + +at::Tensor make_index_tensor(std::vector> indices) {{ + at::ScalarType dtype = at::kInt; + std::vector sizes = {{ + static_cast(indices.size()), + static_cast(indices[0].size())}}; + + // Flatten indices as from_blob reads garbage otherwise. + std::vector acc; + for (auto& vec: indices) {{ + acc.insert(acc.end(), vec.begin(), vec.end()); + }} + + // Clone as original data will be deallocated upon return. 
+ return at::from_blob(acc.data(), sizes, dtype).detach().clone(); +}} + +at::Tensor make_index_tensor(std::vector>> indices) {{ + at::ScalarType dtype = at::kInt; + std::vector sizes = {{ + static_cast(indices.size()), + static_cast(indices[0].size()), + static_cast(indices[0][0].size())}}; + + // Flatten indices as from_blob reads garbage otherwise. + std::vector acc; + for (auto& v: indices) {{ + for (auto& vv: v) {{ + acc.insert(acc.end(), vv.begin(), vv.end()); + }} + }} + + // Clone as original data will be deallocated upon return. + return at::from_blob(acc.data(), sizes, dtype).detach().clone(); +}} + +using ShaderEntry = std::tuple; +using QueryPoolResults = std::vector; +using ShaderTimes = std::unordered_map>; + +void process_querypool_results( + QueryPoolResults& results, + ShaderTimes& shader_times) {{ + for (const ShaderEntry& entry : results) {{ + std::string kernel_name = std::get<0>(entry); + std::uint64_t start_ns = std::get<2>(entry); + std::uint64_t end_ns = std::get<3>(entry); + std::uint64_t duration_ns = end_ns - start_ns; + if (shader_times.find(kernel_name) == shader_times.end()) {{ + shader_times[kernel_name] = std::vector(); + }} + shader_times[kernel_name].emplace_back(duration_ns); + }} +}} + +void register_shader_time_counters( + benchmark::State& state, + ShaderTimes& shader_times) {{ + for (auto& times_list : shader_times) {{ + // Filter to_nchw and nchw_to shaders + if (times_list.first.find("to_nchw") != std::string::npos) {{ + continue; + }} + if (times_list.first.find("nchw_to") != std::string::npos) {{ + continue; + }} + + std::sort(times_list.second.begin(), times_list.second.end()); + uint64_t median_time; + median_time = times_list.second[times_list.second.size() / 2]; + state.counters[times_list.first + " median ns"] = median_time; + }} +}} + +{benchmark_fixtures} + +{def_benchmarks} +""" + + +class VkBenchmarkFileGen: + def __init__(self, out_path): + self.out_path = out_path + self.suites_gens = [] + + def add_suite(self, op_reg_name: str, f: NativeFunction, all_input_cases) -> None: + suites_gen = VkBenchmarkGen(op_reg_name, f, all_input_cases) + self.suites_gens.append(suites_gen) + + def generate_benchmarks_cpp(self) -> str: + return "\n".join([h.generate_benchmark() for h in self.suites_gens]) + + def generate_benchmark_fixtures(self) -> str: + return "\n".join([h.generate_benchmark_fixture() for h in self.suites_gens]) + + def generate_cpp(self) -> str: + return cpp_test_template.format( + benchmark_fixtures=self.generate_benchmark_fixtures(), + def_benchmarks=self.generate_benchmarks_cpp(), + ) diff --git a/backends/vulkan/test/op_tests/utils/codegen.py b/backends/vulkan/test/op_tests/utils/gen_computegraph.py similarity index 77% rename from backends/vulkan/test/op_tests/utils/codegen.py rename to backends/vulkan/test/op_tests/utils/gen_computegraph.py index b39801e7660..f6ee9c78a14 100644 --- a/backends/vulkan/test/op_tests/utils/codegen.py +++ b/backends/vulkan/test/op_tests/utils/gen_computegraph.py @@ -6,15 +6,14 @@ import re from dataclasses import dataclass -from typing import Any, List, Optional, Union +from typing import List, Optional, Union -from executorch.backends.vulkan.test.op_tests.utils.codegen_base import ( +from executorch.backends.vulkan.test.op_tests.utils.aten_types import ( AT_INT_ARRAY_REF, AT_SCALAR, AT_TENSOR, AT_TENSOR_LIST, BOOL, - CppTestFileGen, DOUBLE, INT, OPT_AT_DOUBLE_ARRAY_REF, @@ -28,37 +27,20 @@ OPT_SCALAR_TYPE, STRING, TENSOR_VECTOR, - TestSuite, - TestSuiteGen, THREE_TENSOR_TUPLE, TWO_TENSOR_TUPLE, ) 
+from executorch.backends.vulkan.test.op_tests.utils.test_suite import TestSuite from torchgen.api import cpp from torchgen.api.types import CppSignatureGroup - from torchgen.gen import generate_static_dispatch_backend_call, translate_args - from torchgen.gen_aoti_c_shim import gen_static_dispatch_backend_call_signature from torchgen.model import NativeFunction, Variant -################################## -## Custom Test Suite Definition ## -################################## - - -@dataclass -class VkTestSuite(TestSuite): - def __init__(self, input_cases: List[Any]): - super().__init__(input_cases) - self.storage_types: List[str] = ["utils::kTexture3D"] - self.layouts: List[str] = ["utils::kChannelsPacked"] - self.data_gen: str = "make_rand_tensor" - - -########################## -## Code Generator Class ## -########################## +################################### +## Compute Graph Code Generation ## +################################### @dataclass @@ -105,6 +87,8 @@ def vk_out(self): class ComputeGraphGen: + backend_key = None + def __init__(self, op_reg_name: str, f: NativeFunction, suite_def: TestSuite): self.op_reg_name = op_reg_name self.f = f @@ -230,7 +214,7 @@ def gen_decl(self, fn_name: str, ret_type: str = "void") -> str: def create_aten_fn_call(self) -> str: func_call = generate_static_dispatch_backend_call( - self.f_sig, self.f, TestSuiteGen.backend_key + self.f_sig, self.f, ComputeGraphGen.backend_key )[7:].replace("::cpu", "") return func_call @@ -244,11 +228,12 @@ def create_aten_method_call(self) -> str: func_call = f"ATEN_FN({self.f_sig.name()})({exprs});" return func_call - def create_out_src(self) -> str: + def create_out_src(self, include_declarations: bool = True) -> str: + cpp_type = self.out.cpp_type if include_declarations else "" if Variant.function in self.f.variants: - return f"{self.out.cpp_type} out = " + self.create_aten_fn_call() + "\n" + return f"{cpp_type} out = " + self.create_aten_fn_call() + "\n" else: - return f"{self.out.cpp_type} out = " + self.create_aten_method_call() + "\n" + return f"{cpp_type} out = " + self.create_aten_method_call() + "\n" ## Graph code generation utils @@ -258,7 +243,28 @@ def prepack_ref(self, ref: ValueRef) -> bool: else: return ref.supports_prepack and self.should_prepack - def create_value_for(self, ref: ValueRefList) -> str: # noqa: C901 + def create_value_decl_for(self, ref: ValueRefList) -> str: # noqa: C901 + if isinstance(ref, list): + ret_str = "" + for r in ref: + ret_str += self.create_value_decl_for(r) + return ret_str + + cpp_type = "IOValueRef" if (ref.is_in or ref.requires_prepack) else "ValueRef" + if ref.src_cpp_type == AT_TENSOR_LIST: + ret_str = f"std::vector {ref.name}_io_value_refs;\n" + ret_str += f"std::vector {ref.name}_value_refs;\n" + return ret_str + elif ref.src_cpp_type == TENSOR_VECTOR: + ret_str = f"std::vector {ref.io_value_list_name};\n" + ret_str += f"std::vector {ref.value_list_name};\n" + return ret_str + else: + return f"{cpp_type} {ref.name};\n" + + def create_value_for( # noqa: C901 + self, ref: ValueRefList, include_declarations: bool = True + ) -> str: if isinstance(ref, list): ret_str = "" for r in ref: @@ -269,9 +275,16 @@ def create_value_for(self, ref: ValueRefList) -> str: # noqa: C901 ref_is_view = self.suite_def.is_view_op and ref.is_out cpp_type = "IOValueRef" if (ref.is_in and not prepack) else "ValueRef" + if not include_declarations: + cpp_type = "" if ref.src_cpp_type == OPT_AT_TENSOR: ret_str = f"{cpp_type} {ref.name} = " + if prepack: + ret_str = "" + if 
include_declarations: + ret_str += f"IOValueRef {ref.name};\n" + ret_str += f"{ref.name}.value = " ret_str += f"!{ref.src_cpp_name}.has_value() ? " ret_str += f"{self.graph}{self.dot}add_none() : " if not prepack: @@ -308,11 +321,13 @@ def create_value_for(self, ref: ValueRefList) -> str: # noqa: C901 # each tensor, to facilate staging. On the other hand, we will # use the .value tensor to create a ValueList, which will be passed # to the corresponding ops. - ret_str = f"std::vector {ref.name}_io_value_refs;\n" - ret_str += f"std::vector {ref.name}_value_refs;\n" + ret_str = "" + if include_declarations: + ret_str += f"std::vector {ref.name}_io_value_refs;\n" + ret_str += f"std::vector {ref.name}_value_refs;\n" ret_str += f"for (int i=0; i < {ref.src_cpp_name}.size(); i++) {{\n" ret_str += ( - f" {cpp_type} io_value_ref = {self.graph}{self.dot}add_input_tensor(\n" + f" IOValueRef io_value_ref = {self.graph}{self.dot}add_input_tensor(\n" ) ret_str += f" {ref.src_cpp_name}[i].sizes().vec(),\n" ret_str += ( @@ -324,9 +339,11 @@ def create_value_for(self, ref: ValueRefList) -> str: # noqa: C901 ret_str += f"ValueRef {ref.name} = {self.graph}{self.dot}add_value_list(std::move({ref.name}_value_refs));\n" return ret_str elif ref.src_cpp_type == TENSOR_VECTOR: - ret_str = f""" -std::vector {ref.io_value_list_name}; -std::vector {ref.value_list_name}; + ret_str = "" + if include_declarations: + ret_str += f"std::vector {ref.io_value_list_name};\n" + ret_str += f"std::vector {ref.value_list_name};\n" + ret_str += f""" for (int i=0; i str: # noqa: C901 return ret_str ret_str = f"{cpp_type} {ref.name} = {self.graph}{self.dot}" + if prepack: + ret_str = "" + if include_declarations: + ret_str = f"IOValueRef {ref.name};\n" + ret_str += f"{ref.name}.value = {self.graph}{self.dot}" + if ref.src_cpp_type == AT_TENSOR and ref_is_view: input_name = None for _name, ref in self.refs.items(): @@ -347,8 +370,7 @@ def create_value_for(self, ref: ValueRefList) -> str: # noqa: C901 input_name = ref.name assert input_name is not None - ret_str += "add_tensor_view(" + input_name + ".value);" - pass + ret_str += f"add_tensor_view({input_name}.value);" elif ref.src_cpp_type == AT_TENSOR and not prepack: ret_str += "add_input_tensor(" if ref.is_in else "add_tensor(" ret_str += f"{ref.src_cpp_name}.sizes().vec(), " @@ -400,14 +422,29 @@ def create_op_call(self) -> str: else: op_create_code += ( f"{ref.name}.value, " - if (ref.is_in and not self.prepack_ref(ref)) or ref.is_out + if ref.is_in or ref.requires_prepack or ref.is_out else f"{ref.name}, " ) + # op_create_code += f"{ref.name}, " op_create_code += "out_ref});\n" return op_create_code - def set_output(self, ref: ValueRefList) -> str: + def gen_output_staging_valueref_decl(self, ref: ValueRefList) -> str: + if isinstance(ref, list): + ret_str = "" + for r in ref[:-1]: + ret_str += self.gen_output_staging_valueref_decl(r) + return ret_str + elif ref.src_cpp_type == TENSOR_VECTOR: + assert ref.is_out + ret_str = "" + return ret_str + + assert ref.src_cpp_type == AT_TENSOR and ref.is_out + return f"ValueRef {ref.name}_staging;\n" + + def set_output(self, ref: ValueRefList, include_declarations: bool = True) -> str: if isinstance(ref, list): ret_str = "" for r in ref[:-1]: @@ -424,7 +461,8 @@ def set_output(self, ref: ValueRefList) -> str: return ret_str assert ref.src_cpp_type == AT_TENSOR and ref.is_out - ret_str = f"ValueRef {ref.name}_staging = {self.graph}{self.dot}" + cpptype = "ValueRef" if include_declarations else "" + ret_str = f"{cpptype} {ref.name}_staging 
= {self.graph}{self.dot}" ret_str += f"set_output_tensor({ref.name});\n" return ret_str @@ -542,15 +580,28 @@ def check_graph_out(self, ref: ValueRefList) -> str: ## Top level code generation - def gen_graph_build_code(self) -> str: - graph_build = self.create_out_src() + def gen_arg_valueref_decls(self) -> str: + ret_str = "" + for aten_arg in self.args: + ref = self.refs[aten_arg.name] + ret_str += self.create_value_decl_for(ref) + + ret_str += self.create_value_decl_for(self.refs["out"]) + ret_str += f"{self.out.cpp_type} out;\n" + ret_str += self.gen_output_staging_valueref_decl(self.refs["out"]) + return ret_str + + def gen_graph_build_code(self, include_declarations: bool = True) -> str: + graph_build = self.create_out_src(include_declarations) for aten_arg in self.args: - graph_build += self.create_value_for(self.refs[aten_arg.name]) + graph_build += self.create_value_for( + self.refs[aten_arg.name], include_declarations + ) - graph_build += self.create_value_for(self.refs["out"]) + graph_build += self.create_value_for(self.refs["out"], include_declarations) graph_build += self.create_op_call() - graph_build += self.set_output(self.refs["out"]) + graph_build += self.set_output(self.refs["out"], include_declarations) graph_build += f"{self.graph}{self.dot}prepare();\n" graph_build += f"{self.graph}{self.dot}encode_prepack();\n" @@ -560,7 +611,7 @@ def gen_graph_build_code(self) -> str: graph_build += "\n" return graph_build - def gen_graph_exec_code(self) -> str: + def gen_graph_exec_code(self, check_output=True) -> str: graph_exec = "" for aten_arg in self.args: ref = self.refs[aten_arg.name] @@ -573,26 +624,27 @@ def gen_graph_exec_code(self) -> str: graph_exec += self.declare_vk_out_for(self.refs["out"]) graph_exec += self.copy_from_staging(self.refs["out"]) - graph_exec += self.check_graph_out(self.refs["out"]) + if check_output: + graph_exec += self.check_graph_out(self.refs["out"]) graph_exec = re.sub(r"^", " ", graph_exec, flags=re.M) graph_exec = "{\n" + graph_exec + "\n}" return graph_exec - def gen_conditional_skips(self) -> str: + def gen_conditional_skips(self, skip_str: str = "GTEST_SKIP();") -> str: fp16_skip = f"if (!{self.graph}{self.dot}context()->adapter_ptr()->has_full_float16_buffers_support()) {{\n" - fp16_skip += " GTEST_SKIP();\n" + fp16_skip += f" {skip_str}\n" fp16_skip += "}" fp16_skip = re.sub(r"^", " ", fp16_skip, flags=re.M) + "\n" int8_skip = f"if (!{self.graph}{self.dot}context()->adapter_ptr()->has_full_int8_buffers_support()) {{\n" - int8_skip += " GTEST_SKIP();\n" + int8_skip += f" {skip_str};\n" int8_skip += "}\n" skips = "" - skips = "if (test_dtype == at::kHalf) {\n" + skips += "if (test_dtype == at::kHalf) {\n" skips += fp16_skip skips += "}\n" @@ -606,6 +658,9 @@ def gen_conditional_skips(self) -> str: def gen_op_check_fn(self) -> str: op_name = self.f.func.name.unambiguous_name() + if self.suite_def.test_name_suffix is not None: + op_name += "_" + self.suite_def.test_name_suffix + op_check_fn = self.gen_decl(f"check_{op_name}") + " {\n" if self.should_prepack: op_check_fn = self.gen_decl(f"prepacked_check_{op_name}") + " {\n" @@ -622,146 +677,36 @@ def gen_op_check_fn(self) -> str: return op_check_fn + def gen_build_graph_fn(self, include_declarations: bool = False) -> str: + op_name = self.f.func.name.unambiguous_name() + if self.suite_def.test_name_suffix is not None: + op_name += "_" + self.suite_def.test_name_suffix + op_build_graph_fn = self.gen_decl(f"build_graph_{op_name}") + " {\n" + if self.should_prepack: + op_build_graph_fn = ( + 
self.gen_decl(f"prepacked_build_graph_{op_name}") + " {\n" + ) -################################## -## Test Fixture Code Generation ## -################################## - -test_fixture_template = """ -class GeneratedOpsTest_{op_name} : public ::testing::TestWithParam< ::std::tuple> {{ - protected: - ComputeGraph* graph; - at::ScalarType test_dtype = at::kFloat; - float rtol = {rtol}; - float atol = {atol}; - - void SetUp() override {{ - GraphConfig config; - utils::StorageType default_storage_type; - utils::GPUMemoryLayout default_memory_layout; - std::tie(test_dtype, default_storage_type, default_memory_layout) = GetParam(); - config.set_storage_type_override(default_storage_type); - config.set_memory_layout_override(default_memory_layout); - graph = new ComputeGraph(config); - - if (test_dtype == at::kHalf) {{ - rtol = 1e-2; - atol = 1e-2; - }} - }} - - void TearDown() override {{ - delete graph; - graph = nullptr; - }} - - {check_fn} -}}; -""" - - -class VkTestSuiteGen(TestSuiteGen): - def __init__(self, op_reg_name: str, f: NativeFunction, inputs: VkTestSuite): - super().__init__(f, inputs) - self.op_reg_name = op_reg_name - self.generator = ComputeGraphGen(self.op_reg_name, self.f, self.suite_def) - - def generate_fixture_cpp(self) -> str: - check_fn = "" - if not self.suite_def.requires_prepack: - check_fn = self.generator.gen_op_check_fn() - - prepacked_check_fn = "" - if self.suite_def.supports_prepack(): - self.generator.should_prepack = True - prepacked_check_fn = self.generator.gen_op_check_fn() - check_fn += "\n\n " - check_fn += prepacked_check_fn - - return test_fixture_template.format( - op_name=self.op_name, - check_fn=check_fn, - rtol=self.suite_def.rtol, - atol=self.suite_def.atol, - ) + op_build_graph_fn_body = "" + op_build_graph_fn_body += self.gen_graph_build_code(include_declarations) - def gen_parameterization(self) -> str: - dtypes = self.suite_def.dtypes - storage_types = self.suite_def.storage_types - layouts = self.suite_def.layouts - - return f""" -INSTANTIATE_TEST_SUITE_P( - Combos_{self.op_name}, - GeneratedOpsTest_{self.op_name}, - ::testing::Combine( - ::testing::Values({', '.join(dtypes)}), - ::testing::Values({', '.join(storage_types)}), - ::testing::Values({', '.join(layouts)}))); - """ - - -############################## -## Test File Code Generation ## -############################### - -preamble_str = """ -#include -#include -#include - -#include - -using namespace vkcompute; -using TensorOptions = at::TensorOptions; - -vkapi::ScalarType from_at_scalartype(c10::ScalarType at_scalartype) { - switch (at_scalartype) { - case c10::kFloat: - return vkapi::kFloat; - case c10::kHalf: - return vkapi::kHalf; - case c10::kInt: - return vkapi::kInt; - case c10::kLong: - return vkapi::kInt; - case c10::kChar: - return vkapi::kChar; - default: - VK_THROW("Unsupported at::ScalarType!"); - } -} - -#ifdef USE_VULKAN_FP16_INFERENCE -bool check_close(at::Tensor& t1, at::Tensor& t2, float rtol=1e-2, float atol=1e-2) { -#else -bool check_close(at::Tensor& t1, at::Tensor& t2, float rtol=1e-5, float atol=1e-5) { -#endif - // Skip checking index tensors - if (t1.scalar_type() == at::kLong || t2.scalar_type() == at::kLong) { - return true; - } - bool is_close = at::allclose(t1, t2, rtol, atol); - if (!is_close && t1.numel() < 500) { - std::cout << "reference: " << std::endl; - print(t1, 150); - std::cout << std::endl; - std::cout << "vulkan: " << std::endl; - print(t2, 150); - std::cout << std::endl; - } - return is_close; -} -""" + op_build_graph_fn += 
op_build_graph_fn_body + op_build_graph_fn += "\n }" + return op_build_graph_fn + def gen_op_exec_graph_fn(self) -> str: + op_name = self.f.func.name.unambiguous_name() + if self.suite_def.test_name_suffix is not None: + op_name += "_" + self.suite_def.test_name_suffix + op_benchmark_fn = self.gen_decl(f"benchmark_{op_name}") + " {\n" + if self.should_prepack: + op_benchmark_fn = self.gen_decl(f"prepacked_benchmark_{op_name}") + " {\n" -class VkCppTestFileGen(CppTestFileGen): - def __init__(self, out_path: str): - super().__init__(out_path) + op_benchmark_fn_body = "" + op_benchmark_fn_body += self.gen_graph_exec_code(False) - def generate_preamble(self) -> str: - return preamble_str + op_benchmark_fn_body = re.sub(r"^", " ", op_benchmark_fn_body, flags=re.M) - def add_suite(self, op_reg_name: str, f: NativeFunction, all_input_cases) -> None: - suites_gen = VkTestSuiteGen(op_reg_name, f, all_input_cases) - self.suites_gens.append(suites_gen) + op_benchmark_fn += op_benchmark_fn_body + op_benchmark_fn += "\n }" + return op_benchmark_fn diff --git a/backends/vulkan/test/op_tests/utils/codegen_base.py b/backends/vulkan/test/op_tests/utils/gen_correctness_base.py similarity index 87% rename from backends/vulkan/test/op_tests/utils/codegen_base.py rename to backends/vulkan/test/op_tests/utils/gen_correctness_base.py index 1ebebe699a0..def3508a8a7 100644 --- a/backends/vulkan/test/op_tests/utils/codegen_base.py +++ b/backends/vulkan/test/op_tests/utils/gen_correctness_base.py @@ -7,62 +7,31 @@ import re from typing import Any, List +from executorch.backends.vulkan.test.op_tests.utils.aten_types import ( + AT_INT_ARRAY_REF, + AT_SCALAR, + AT_TENSOR, + AT_TENSOR_LIST, + BOOL, + DOUBLE, + INT, + OPT_AT_DOUBLE_ARRAY_REF, + OPT_AT_INT_ARRAY_REF, + OPT_AT_TENSOR, + OPT_BOOL, + OPT_DEVICE, + OPT_INT64, + OPT_LAYOUT, + OPT_MEMORY_FORMAT, + OPT_SCALAR_TYPE, + STRING, +) +from executorch.backends.vulkan.test.op_tests.utils.test_suite import TestSuite + from torchgen.api import cpp from torchgen.api.types import CppSignatureGroup from torchgen.model import Argument, NativeFunction -######################## -## ATen code patterns ## -######################## - -AT_INT_ARRAY_REF = "at::IntArrayRef" -AT_SCALAR = "at::Scalar" -AT_TENSOR = "at::Tensor" -AT_TENSOR_LIST = "at::TensorList" -BOOL = "bool" -DOUBLE = "double" -INT = "int64_t" -OPT_AT_DOUBLE_ARRAY_REF = "::std::optional>" -OPT_AT_INT_ARRAY_REF = "at::OptionalIntArrayRef" -OPT_AT_TENSOR = "::std::optional" -OPT_BOOL = "::std::optional" -OPT_INT64 = "::std::optional" -OPT_DEVICE = "::std::optional" -OPT_LAYOUT = "::std::optional" -OPT_MEMORY_FORMAT = "::std::optional" -OPT_SCALAR_TYPE = "::std::optional" -STRING = "c10::string_view" -TWO_TENSOR_TUPLE = "::std::tuple" -THREE_TENSOR_TUPLE = "::std::tuple" -TENSOR_VECTOR = "::std::vector" - -########################### -## Test Suite definition ## -########################### - - -class TestSuite: - def __init__(self, input_cases: List[Any]): - self.input_cases: List[Any] = input_cases - self.prepacked_args: List[str] = [] - self.requires_prepack: bool = False - self.dtypes: List[str] = ["at::kFloat", "at::kHalf"] - - self.data_gen: str = "make_rand_tensor" - self.data_range = (0, 1) - - self.arg_dtype = {} - self.arg_data_range = {} - - self.atol: str = "1e-5" - self.rtol: str = "1e-5" - - self.is_view_op: bool = False - - def supports_prepack(self): - return len(self.prepacked_args) > 0 - - ########################## ## Test Suite Generation ## ########################## @@ -105,13 +74,13 @@ def 
get_or_return_default(arg: Argument, inputs: List[Any], i: int): return arg.default -class TestSuiteGen: - backend_key = None - +class CorrectnessTestGen: def __init__(self, f: NativeFunction, test_suite: TestSuite): self.f = f self.suite_def = test_suite self.op_name = f.func.name.unambiguous_name() + if test_suite.test_name_suffix is not None: + self.op_name += f"_{test_suite.test_name_suffix}" self.f_sig = CppSignatureGroup.from_native_function( self.f, method=False, fallback_binding=self.f.manual_cpp_binding @@ -379,7 +348,7 @@ def generate_suite_cpp(self) -> str: """ -class CppTestFileGen: +class CorrectnessTestFileGen: def __init__(self, out_path): self.out_path = out_path self.suites_gens = [] @@ -397,5 +366,5 @@ def generate_test_suites_cpp(self) -> str: return "\n".join([h.generate_suite_cpp() for h in self.suites_gens]) def add_suite(self, op_reg_name: str, f: NativeFunction, all_input_cases) -> None: - suites_gen = TestSuiteGen(f, all_input_cases) + suites_gen = CorrectnessTestGen(f, all_input_cases) self.suites_gens.append(suites_gen) diff --git a/backends/vulkan/test/op_tests/utils/gen_correctness_vk.py b/backends/vulkan/test/op_tests/utils/gen_correctness_vk.py new file mode 100644 index 00000000000..6c165a777db --- /dev/null +++ b/backends/vulkan/test/op_tests/utils/gen_correctness_vk.py @@ -0,0 +1,159 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from executorch.backends.vulkan.test.op_tests.utils.gen_computegraph import ( + ComputeGraphGen, +) +from executorch.backends.vulkan.test.op_tests.utils.gen_correctness_base import ( + CorrectnessTestFileGen, + CorrectnessTestGen, +) +from executorch.backends.vulkan.test.op_tests.utils.test_suite import VkTestSuite + +from torchgen.model import NativeFunction + +################################## +## Test Fixture Code Generation ## +################################## + +test_fixture_template = """ +class GeneratedOpsTest_{op_name} : public ::testing::TestWithParam< ::std::tuple> {{ + protected: + ComputeGraph* graph; + at::ScalarType test_dtype = at::kFloat; + float rtol = {rtol}; + float atol = {atol}; + + void SetUp() override {{ + GraphConfig config; + utils::StorageType default_storage_type; + utils::GPUMemoryLayout default_memory_layout; + std::tie(test_dtype, default_storage_type, default_memory_layout) = GetParam(); + config.set_storage_type_override(default_storage_type); + config.set_memory_layout_override(default_memory_layout); + graph = new ComputeGraph(config); + + if (test_dtype == at::kHalf) {{ + rtol = 1e-2; + atol = 1e-2; + }} + }} + + void TearDown() override {{ + delete graph; + graph = nullptr; + }} + + {check_fn} +}}; +""" + + +class VkCorrectnessTestGen(CorrectnessTestGen): + def __init__(self, op_reg_name: str, f: NativeFunction, inputs: VkTestSuite): + super().__init__(f, inputs) + self.op_reg_name = op_reg_name + self.generator = ComputeGraphGen(self.op_reg_name, self.f, self.suite_def) + + def generate_fixture_cpp(self) -> str: + check_fn = "" + if not self.suite_def.requires_prepack: + check_fn = self.generator.gen_op_check_fn() + + prepacked_check_fn = "" + if self.suite_def.supports_prepack(): + self.generator.should_prepack = True + prepacked_check_fn = self.generator.gen_op_check_fn() + check_fn += "\n\n " + check_fn += prepacked_check_fn + + return test_fixture_template.format( + op_name=self.op_name, + check_fn=check_fn, + 
rtol=self.suite_def.rtol, + atol=self.suite_def.atol, + ) + + def gen_parameterization(self) -> str: + dtypes = self.suite_def.dtypes + storage_types = self.suite_def.storage_types + layouts = self.suite_def.layouts + + return f""" +INSTANTIATE_TEST_SUITE_P( + Combos_{self.op_name}, + GeneratedOpsTest_{self.op_name}, + ::testing::Combine( + ::testing::Values({', '.join(dtypes)}), + ::testing::Values({', '.join(storage_types)}), + ::testing::Values({', '.join(layouts)}))); + """ + + +############################## +## Test File Code Generation ## +############################### + +preamble_str = """ +#include +#include +#include + +#include + +using namespace vkcompute; +using TensorOptions = at::TensorOptions; + +vkapi::ScalarType from_at_scalartype(c10::ScalarType at_scalartype) { + switch (at_scalartype) { + case c10::kFloat: + return vkapi::kFloat; + case c10::kHalf: + return vkapi::kHalf; + case c10::kInt: + return vkapi::kInt; + case c10::kLong: + return vkapi::kInt; + case c10::kChar: + return vkapi::kChar; + default: + VK_THROW("Unsupported at::ScalarType!"); + } +} + +#ifdef USE_VULKAN_FP16_INFERENCE +bool check_close(at::Tensor& t1, at::Tensor& t2, float rtol=1e-2, float atol=1e-2) { +#else +bool check_close(at::Tensor& t1, at::Tensor& t2, float rtol=1e-5, float atol=1e-5) { +#endif + // Skip checking index tensors + if (t1.scalar_type() == at::kLong || t2.scalar_type() == at::kLong) { + return true; + } + bool is_close = at::allclose(t1, t2, rtol, atol); + if (!is_close && t1.numel() < 500) { + std::cout << "reference: " << std::endl; + print(t1, 150); + std::cout << std::endl; + std::cout << "vulkan: " << std::endl; + print(t2, 150); + std::cout << std::endl; + } + return is_close; +} +""" + + +class VkCorrectnessTestFileGen(CorrectnessTestFileGen): + def __init__(self, out_path: str): + super().__init__(out_path) + + def generate_preamble(self) -> str: + return preamble_str + + def add_suite(self, op_reg_name: str, f: NativeFunction, all_input_cases) -> None: + suites_gen = VkCorrectnessTestGen(op_reg_name, f, all_input_cases) + self.suites_gens.append(suites_gen) diff --git a/backends/vulkan/test/op_tests/utils/test_suite.py b/backends/vulkan/test/op_tests/utils/test_suite.py new file mode 100644 index 00000000000..dd01bdde3a4 --- /dev/null +++ b/backends/vulkan/test/op_tests/utils/test_suite.py @@ -0,0 +1,49 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +from dataclasses import dataclass +from typing import Any, List, Optional + +################################### +## Generic Test Suite definition ## +################################### + + +class TestSuite: + def __init__(self, input_cases: List[Any]): + self.input_cases: List[Any] = input_cases + self.prepacked_args: List[str] = [] + self.requires_prepack: bool = False + self.dtypes: List[str] = ["at::kFloat", "at::kHalf"] + + self.data_gen: str = "make_rand_tensor" + self.data_range = (0, 1) + + self.arg_dtype = {} + self.arg_data_range = {} + + self.atol: str = "1e-5" + self.rtol: str = "1e-5" + + self.is_view_op: bool = False + self.test_name_suffix: Optional[str] = None + + def supports_prepack(self): + return len(self.prepacked_args) > 0 + + +################################## +## Vulkan Test Suite Definition ## +################################## + + +@dataclass +class VkTestSuite(TestSuite): + def __init__(self, input_cases: List[Any]): + super().__init__(input_cases) + self.storage_types: List[str] = ["utils::kTexture3D"] + self.layouts: List[str] = ["utils::kChannelsPacked"] + self.data_gen: str = "make_rand_tensor" diff --git a/backends/vulkan/test/test_vulkan_delegate.py b/backends/vulkan/test/test_vulkan_delegate.py index 9f57ec49a89..e6ddf1cdb86 100644 --- a/backends/vulkan/test/test_vulkan_delegate.py +++ b/backends/vulkan/test/test_vulkan_delegate.py @@ -204,6 +204,16 @@ def forward(self, x, y, w): self.lower_module_and_test_output(add_module, sample_inputs) + sample_inputs = ( + torch.rand(size=(4, 5, 2, 3), dtype=torch.float32), + torch.rand(size=(4, 5, 2, 3), dtype=torch.float32), + torch.rand( + size=(2, 3), dtype=torch.float32 + ), # test broadcasting on packed dim + ) + + self.lower_module_and_test_output(add_module, sample_inputs) + def test_vulkan_backend_add_int(self): class AddIntModule(torch.nn.Module): def __init__(self): @@ -1633,6 +1643,42 @@ def forward(self, x): memory_layouts=[vk_graph_schema.VkMemoryLayout.TENSOR_CHANNELS_PACKED], ) + def test_vulkan_backend_conv_with_clamp(self): + class ConvWithClampModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.weight = torch.randn(6, 8, 3, 3) + self.bias = torch.randn(8) + self.stride = (1, 2) + self.padding = (2, 3) + self.dilation = (1, 1) + self.transposed = True + self.output_padding = (0, 1) + self.groups = 1 + self.output_min = 0 + self.output_max = 10 + + def forward(self, x): + return torch.ops.et_vk.conv_with_clamp( + x, + self.weight, + self.bias, + self.stride, + self.padding, + self.dilation, + self.transposed, + self.output_padding, + self.groups, + self.output_min, + self.output_max, + ) + + self.lower_module_and_test_output( + ConvWithClampModule(), + (torch.randn(size=(1, 6, 40, 50), dtype=torch.float32),), + memory_layouts=[vk_graph_schema.VkMemoryLayout.TENSOR_CHANNELS_PACKED], + ) + def test_vulkan_backend_grid_priors(self): class GridPriorsModule(torch.nn.Module): def __init__(self): diff --git a/backends/vulkan/test/utils/test_utils.cpp b/backends/vulkan/test/utils/test_utils.cpp index 6c056cc9d90..86e9cfc5d57 100644 --- a/backends/vulkan/test/utils/test_utils.cpp +++ b/backends/vulkan/test/utils/test_utils.cpp @@ -8,13 +8,15 @@ #include -#include +#include #include #include #include +using namespace vkcompute; + // // Operator Recording Functions // @@ -68,15 +70,14 @@ void record_nchw_to_image_op( vkapi::VulkanBuffer& src_buffer, api::vTensor& v_dst) { vkapi::PipelineBarrier pipeline_barrier{}; - vkapi::SpecVarList specialization_constants = { - 
SV(v_dst.packed_dim_whcn_idx())}; + vkapi::SpecVarList specialization_constants = {SV(v_dst.packed_dim())}; context->submit_compute_job( get_nchw_to_tensor_shader( v_dst, context->adapter_ptr()->has_full_int8_buffers_support()), pipeline_barrier, - v_dst.image_extents(), - adaptive_work_group_size(v_dst.image_extents()), + v_dst.logical_limits(), + adaptive_work_group_size(v_dst.logical_limits()), specialization_constants, VK_NULL_HANDLE, 0, @@ -85,7 +86,8 @@ void record_nchw_to_image_op( vkapi::PipelineStage::COMPUTE, vkapi::MemoryAccessType::WRITE), src_buffer, - v_dst.sizes_ubo()); + v_dst.sizes_ubo(), + v_dst.axis_map_ubo()); } void record_image_to_nchw_op( @@ -93,26 +95,26 @@ void record_image_to_nchw_op( api::vTensor& v_src, vkapi::VulkanBuffer& dst_buffer) { vkapi::PipelineBarrier pipeline_barrier{}; - vkapi::SpecVarList specialization_constants = { - SV(v_src.packed_dim_whcn_idx())}; + vkapi::SpecVarList specialization_constants = {SV(v_src.packed_dim())}; context->submit_compute_job( get_tensor_to_nchw_shader(v_src), pipeline_barrier, - v_src.image_extents(), - adaptive_work_group_size(v_src.image_extents()), + v_src.logical_limits(), + adaptive_work_group_size(v_src.logical_limits()), specialization_constants, VK_NULL_HANDLE, 0, dst_buffer, v_src.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE), - v_src.sizes_ubo()); + v_src.sizes_ubo(), + v_src.axis_map_ubo()); } void record_int8_image_to_nchw_noint8_op( api::Context* const context, api::vTensor& v_src, - api::StorageBuffer& dst_buffer) { + api::StagingBuffer& dst_buffer) { vkapi::PipelineBarrier pipeline_barrier{}; uint32_t buffer_len = utils::safe_downcast(dst_buffer.numel() / 4); utils::uvec3 global_wg_size = {buffer_len, 1, 1}; @@ -121,12 +123,13 @@ void record_int8_image_to_nchw_noint8_op( pipeline_barrier, global_wg_size, adaptive_work_group_size(global_wg_size), - {v_src.packed_dim_whcn_idx()}, + {v_src.packed_dim()}, VK_NULL_HANDLE, 0, dst_buffer.buffer(), v_src.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE), v_src.sizes_ubo(), + v_src.axis_map_ubo(), v_src.numel_ubo()); } @@ -155,8 +158,8 @@ void record_conv2d_prepack_weights_op( context->submit_compute_job( shader, pipeline_barrier, - v_dst.image_extents(), - adaptive_work_group_size(v_dst.image_extents()), + v_dst.logical_limits(), + adaptive_work_group_size(v_dst.logical_limits()), specialization_constants, VK_NULL_HANDLE, 0, @@ -183,8 +186,8 @@ void record_binary_op( context->submit_compute_job( VK_KERNEL_FROM_STR(kernel_name), pipeline_barrier, - v_dst.image_extents(), - adaptive_work_group_size(v_dst.image_extents()), + v_dst.logical_limits(), + adaptive_work_group_size(v_dst.logical_limits()), specialization_constants, VK_NULL_HANDLE, 0, @@ -311,6 +314,42 @@ void record_reference_matmul( mat2.strides_ubo()); } +void record_matmul_texture3d( + api::Context* context, + api::vTensor& out, + api::vTensor& mat1, + api::vTensor& mat2) { + std::string kernel_name = "matmul_naive"; + kernel_name.reserve(kShaderNameReserve); + add_storage_type_suffix(kernel_name, out.storage_type()); + add_dtype_suffix(kernel_name, out.dtype()); + + utils::uvec3 global_wg_size = out.logical_limits(); + + vkapi::PipelineBarrier pipeline_barrier{}; + api::context()->submit_compute_job( + VK_KERNEL_FROM_STR(kernel_name), + pipeline_barrier, + global_wg_size, + {8, 8, 1}, + {out.packed_dim(), mat1.packed_dim(), mat2.packed_dim()}, + VK_NULL_HANDLE, + 0, + out.image( + pipeline_barrier, + vkapi::PipelineStage::COMPUTE, + vkapi::MemoryAccessType::WRITE), + 
mat1.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE), + mat2.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE), + out.sizes_ubo(), + out.logical_limits_ubo(), + out.axis_map_ubo(), + mat1.sizes_ubo(), + mat1.axis_map_ubo(), + mat2.sizes_ubo(), + mat2.axis_map_ubo()); +} + // // Input & Output Utilities // @@ -319,22 +358,22 @@ void record_reference_matmul( _(uint8_t, Byte) \ _(int8_t, Char) \ _(int32_t, Int) \ - _(torch::executor::Half, Half) \ + _(exec_aten::Half, Half) \ _(float, Float) \ _(int8_t, QInt8) void fill_vtensor(api::vTensor& vten, std::vector& data) { - api::StorageBuffer staging_buffer(api::context(), vten.dtype(), data.size()); - -#define CASE(ctype, name) \ - case vkapi::ScalarType::name: { \ - std::vector data_converted; \ - data_converted.resize(data.size()); \ - for (int i = 0; i < data.size(); ++i) { \ - data_converted[i] = ctype(data[i]); \ - } \ - copy_ptr_to_staging( \ - data_converted.data(), staging_buffer, vten.staging_buffer_nbytes()); \ + api::StagingBuffer staging_buffer(api::context(), vten.dtype(), data.size()); + +#define CASE(ctype, name) \ + case vkapi::ScalarType::name: { \ + std::vector data_converted; \ + data_converted.resize(data.size()); \ + for (int i = 0; i < data.size(); ++i) { \ + data_converted[i] = ctype(data[i]); \ + } \ + staging_buffer.copy_from( \ + data_converted.data(), vten.staging_buffer_nbytes()); \ } break; switch (vten.dtype()) { @@ -377,6 +416,20 @@ std::vector create_random_float_buffer( return data; } +std::vector create_random_uint8_buffer( + const size_t numel, + const uint8_t min, + const uint8_t max) { + std::vector data(numel); + std::default_random_engine rng; + std::uniform_real_distribution dist(min, max); + + for (size_t i = 0; i < data.size(); ++i) { + data[i] = (uint8_t)dist(rng); + } + return data; +} + void fill_vtensor( ComputeGraph& graph, const IOValueRef idx, @@ -397,7 +450,7 @@ void fill_vtensor( } void extract_vtensor(api::vTensor& vten, std::vector& data) { - api::StorageBuffer staging_buffer( + api::StagingBuffer staging_buffer( api::context(), vten.dtype(), vten.staging_buffer_numel()); if (vten.storage_type() == utils::StorageType::BUFFER) { @@ -410,14 +463,14 @@ void extract_vtensor(api::vTensor& vten, std::vector& data) { api::context()->submit_cmd_to_gpu(fence.get_submit_handle()); fence.wait(); -#define CASE(ctype, name) \ - case vkapi::ScalarType::name: { \ - std::vector data_converted(data.size()); \ - copy_staging_to_ptr( \ - staging_buffer, data_converted.data(), vten.staging_buffer_nbytes()); \ - for (int i = 0; i < data.size(); ++i) { \ - data[i] = float(data_converted[i]); \ - } \ +#define CASE(ctype, name) \ + case vkapi::ScalarType::name: { \ + std::vector data_converted(data.size()); \ + staging_buffer.copy_to( \ + data_converted.data(), vten.staging_buffer_nbytes()); \ + for (int i = 0; i < data.size(); ++i) { \ + data[i] = float(data_converted[i]); \ + } \ } break; switch (vten.dtype()) { @@ -440,8 +493,10 @@ void submit_to_gpu() { } vkapi::Allocation allocate_memory_for(const api::vTensor& vten) { + VmaAllocationCreateInfo alloc_create_info = + api::context()->adapter_ptr()->vma().gpuonly_resource_create_info(); return api::context()->adapter_ptr()->vma().create_allocation( - vten.get_memory_requirements(), vten.get_allocation_create_info()); + vten.get_memory_requirements(), alloc_create_info); } VmaTotalStatistics get_vma_stats() { diff --git a/backends/vulkan/test/utils/test_utils.h b/backends/vulkan/test/utils/test_utils.h index bf549446170..d9d83a9620f 100644 --- 
a/backends/vulkan/test/utils/test_utils.h +++ b/backends/vulkan/test/utils/test_utils.h @@ -16,11 +16,9 @@ #include #include -using namespace vkcompute; - #define CREATE_FLOAT_TEXTURE(sizes, allocate_memory) \ - api::vTensor( \ - api::context(), \ + vkcompute::api::vTensor( \ + vkcompute::api::context(), \ sizes, \ vkapi::kFloat, \ utils::StorageType::TEXTURE_3D, \ @@ -28,25 +26,29 @@ using namespace vkcompute; allocate_memory); #define CREATE_FLOAT_BUFFER(sizes, allocate_memory) \ - api::vTensor( \ - api::context(), \ + vkcompute::api::vTensor( \ + vkcompute::api::context(), \ sizes, \ vkapi::kFloat, \ utils::StorageType::BUFFER, \ utils::GPUMemoryLayout::TENSOR_WIDTH_PACKED, \ allocate_memory); -#define DEFINE_STAGING_BUFFER_AND_RECORD_TO_GPU_FOR(tensor) \ - api::StorageBuffer staging_buffer_##tensor( \ - api::context(), vkapi::kFloat, tensor.staging_buffer_numel()); \ - record_nchw_to_image_op( \ - api::context(), staging_buffer_##tensor.buffer(), tensor); - -#define DEFINE_STAGING_BUFFER_AND_RECORD_FROM_GPU_FOR(tensor) \ - api::StorageBuffer staging_buffer_##tensor( \ - api::context(), vkapi::kFloat, tensor.staging_buffer_numel()); \ - record_image_to_nchw_op( \ - api::context(), tensor, staging_buffer_##tensor.buffer()); +#define DEFINE_STAGING_BUFFER_AND_RECORD_TO_GPU_FOR(tensor) \ + vkcompute::api::StagingBuffer staging_buffer_##tensor( \ + vkcompute::api::context(), \ + vkapi::kFloat, \ + tensor.staging_buffer_numel()); \ + record_nchw_to_image_op( \ + vkcompute::api::context(), staging_buffer_##tensor.buffer(), tensor); + +#define DEFINE_STAGING_BUFFER_AND_RECORD_FROM_GPU_FOR(tensor) \ + vkcompute::api::StagingBuffer staging_buffer_##tensor( \ + vkcompute::api::context(), \ + vkapi::kFloat, \ + tensor.staging_buffer_numel()); \ + record_image_to_nchw_op( \ + vkcompute::api::context(), tensor, staging_buffer_##tensor.buffer()); #define CHECK_VALUE(data, idx, expected) \ do { \ @@ -63,108 +65,125 @@ using namespace vkcompute; // void record_nchw_to_buffer_op( - api::Context* const context, - vkapi::VulkanBuffer& src_buffer, - api::vTensor& v_dst); + vkcompute::api::Context* const context, + vkcompute::vkapi::VulkanBuffer& src_buffer, + vkcompute::api::vTensor& v_dst); void record_buffer_to_nchw_op( - api::Context* const context, - api::vTensor& v_src, - vkapi::VulkanBuffer& dst_buffer); + vkcompute::api::Context* const context, + vkcompute::api::vTensor& v_src, + vkcompute::vkapi::VulkanBuffer& dst_buffer); void record_nchw_to_image_op( - api::Context* const context, - vkapi::VulkanBuffer& src_buffer, - api::vTensor& v_dst); + vkcompute::api::Context* const context, + vkcompute::vkapi::VulkanBuffer& src_buffer, + vkcompute::api::vTensor& v_dst); void record_image_to_nchw_op( - api::Context* const context, - api::vTensor& v_src, - vkapi::VulkanBuffer& dst_buffer); + vkcompute::api::Context* const context, + vkcompute::api::vTensor& v_src, + vkcompute::vkapi::VulkanBuffer& dst_buffer); void record_int8_image_to_nchw_noint8_op( - api::Context* const context, - api::vTensor& v_src, - api::StorageBuffer& dst_buffer); + vkcompute::api::Context* const context, + vkcompute::api::vTensor& v_src, + vkcompute::api::StagingBuffer& dst_buffer); void record_conv2d_prepack_weights_op( - api::Context* const context, - vkapi::VulkanBuffer& src_buffer, - api::vTensor& v_dst, + vkcompute::api::Context* const context, + vkcompute::vkapi::VulkanBuffer& src_buffer, + vkcompute::api::vTensor& v_dst, const std::vector& original_sizes, const bool transposed); void record_binary_op( - api::Context* const 
context, + vkcompute::api::Context* const context, const std::string& op_name, - api::vTensor& v_in1, - api::vTensor& v_in2, - api::vTensor& v_dst); + vkcompute::api::vTensor& v_in1, + vkcompute::api::vTensor& v_in2, + vkcompute::api::vTensor& v_dst); void execute_and_check_add( - api::vTensor& a, - api::vTensor& b, - api::vTensor& c, + vkcompute::api::vTensor& a, + vkcompute::api::vTensor& b, + vkcompute::api::vTensor& c, float a_val, float b_val); -void record_index_fill_buffer(api::Context* const context, api::vTensor& v_ten); +void record_index_fill_buffer( + vkcompute::api::Context* const context, + vkcompute::api::vTensor& v_ten); void record_scalar_add_buffer( - api::Context* context, - api::vTensor& v_ten, + vkcompute::api::Context* context, + vkcompute::api::vTensor& v_ten, float offset); void record_reference_matmul( - api::Context* context, - api::vTensor& out, - api::vTensor& mat1, - api::vTensor& mat2); + vkcompute::api::Context* context, + vkcompute::api::vTensor& out, + vkcompute::api::vTensor& mat1, + vkcompute::api::vTensor& mat2); + +void record_matmul_texture3d( + vkcompute::api::Context* context, + vkcompute::api::vTensor& out, + vkcompute::api::vTensor& mat1, + vkcompute::api::vTensor& mat2); // // Input & Output Utilities // -inline void -fill_staging(api::StorageBuffer& staging, float val, int numel = -1) { +inline void fill_staging( + vkcompute::api::StagingBuffer& staging, + float val, + int numel = -1) { if (numel < 0) { numel = staging.numel(); } std::vector data(numel); std::fill(data.begin(), data.end(), val); - copy_ptr_to_staging(data.data(), staging, sizeof(float) * numel); + staging.copy_from(data.data(), sizeof(float) * numel); } -void fill_vtensor(api::vTensor& vten, std::vector& data); +void fill_vtensor(vkcompute::api::vTensor& vten, std::vector& data); -void fill_vtensor(api::vTensor& vten, float val, bool iota = false); +void fill_vtensor(vkcompute::api::vTensor& vten, float val, bool iota = false); std::vector create_random_float_buffer( const size_t numel, const float min = 0, const float max = 1); +std::vector create_random_uint8_buffer( + const size_t numel, + const uint8_t min = 0, + const uint8_t max = 255); + void fill_vtensor( - ComputeGraph& graph, - const IOValueRef idx, + vkcompute::ComputeGraph& graph, + const vkcompute::IOValueRef idx, float val, bool iota = false); -void extract_vtensor(api::vTensor& vten, std::vector& data); +void extract_vtensor(vkcompute::api::vTensor& vten, std::vector& data); -inline std::vector extract_vtensor(api::vTensor& vten) { +inline std::vector extract_vtensor(vkcompute::api::vTensor& vten) { std::vector data_out(vten.staging_buffer_numel()); extract_vtensor(vten, data_out); return data_out; } -inline void -check_staging_buffer(api::StorageBuffer& staging, float val, int numel = -1) { +inline void check_staging_buffer( + vkcompute::api::StagingBuffer& staging, + float val, + int numel = -1) { if (numel < 0) { numel = staging.numel(); } std::vector data(numel); - copy_staging_to_ptr(staging, data.data(), sizeof(float) * numel); + staging.copy_to(data.data(), sizeof(float) * numel); for (size_t i = 0; i < data.size(); ++i) { CHECK_VALUE(data, i, val); @@ -172,21 +191,21 @@ check_staging_buffer(api::StorageBuffer& staging, float val, int numel = -1) { } inline int64_t get_buf_idx( - ComputeGraph& graph, - IOValueRef ref, + vkcompute::ComputeGraph& graph, + vkcompute::IOValueRef ref, const std::vector& tensor_coor) { - vTensorPtr vten_ptr = graph.get_tensor(ref.value); + vkcompute::vTensorPtr vten_ptr = 
graph.get_tensor(ref.value); const std::vector& sizes = vten_ptr->sizes(); - int64_t c = dim_at(sizes); - int64_t h = dim_at(sizes); - int64_t w = dim_at(sizes); + int64_t c = vkcompute::dim_at(sizes); + int64_t h = vkcompute::dim_at(sizes); + int64_t w = vkcompute::dim_at(sizes); - int64_t ni = dim_at(tensor_coor); - int64_t ci = dim_at(tensor_coor); - int64_t hi = dim_at(tensor_coor); - int64_t wi = dim_at(tensor_coor); + int64_t ni = vkcompute::dim_at(tensor_coor); + int64_t ci = vkcompute::dim_at(tensor_coor); + int64_t hi = vkcompute::dim_at(tensor_coor); + int64_t wi = vkcompute::dim_at(tensor_coor); return (ni * c * h * w + ci * h * w + hi * w + wi); } @@ -197,7 +216,8 @@ inline int64_t get_buf_idx( void submit_to_gpu(); -vkapi::Allocation allocate_memory_for(const api::vTensor& vten); +vkcompute::vkapi::Allocation allocate_memory_for( + const vkcompute::api::vTensor& vten); VmaTotalStatistics get_vma_stats(); @@ -208,7 +228,7 @@ size_t get_vma_allocation_count(); // void execute_graph_and_check_output( - ComputeGraph& graph, + vkcompute::ComputeGraph& graph, std::vector input_vals, std::vector expected_outputs); diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index 157f995ab4c..9a99b11f758 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -11,7 +11,7 @@ #include #include -#include +#include #include @@ -21,8 +21,11 @@ #include +#include + #include +using namespace vkcompute; using namespace vkcompute::api; std::vector @@ -177,57 +180,32 @@ TEST_F(VulkanComputeAPITest, empty_init_shader_info_test) { TEST_F(VulkanComputeAPITest, calculate_dim_order_test) { // ndim, GPUMemoryLayout, expected dim order pairs - std::vector>> - test_cases = { - {1, utils::kWidthPacked, {0}}, - {1, utils::kHeightPacked, {0}}, - {1, utils::kChannelsPacked, {0}}, - {2, utils::kWidthPacked, {0, 1}}, - {2, utils::kHeightPacked, {1, 0}}, - {2, utils::kChannelsPacked, {0, 1}}, - {3, utils::kWidthPacked, {0, 1, 2}}, - {3, utils::kHeightPacked, {0, 2, 1}}, - {3, utils::kChannelsPacked, {1, 2, 0}}, - {4, utils::kWidthPacked, {0, 1, 2, 3}}, - {4, utils::kHeightPacked, {0, 1, 3, 2}}, - {4, utils::kChannelsPacked, {0, 2, 3, 1}}, - }; + std::vector>> test_cases = { + {1, WHCN::kWidthDim, {0}}, + {1, WHCN::kHeightDim, {0}}, + {1, WHCN::kChannelsDim, {0}}, + {2, WHCN::kWidthDim, {0, 1}}, + {2, WHCN::kHeightDim, {1, 0}}, + {2, WHCN::kChannelsDim, {0, 1}}, + {3, WHCN::kWidthDim, {0, 1, 2}}, + {3, WHCN::kHeightDim, {0, 2, 1}}, + {3, WHCN::kChannelsDim, {1, 2, 0}}, + {4, WHCN::kWidthDim, {0, 1, 2, 3}}, + {4, WHCN::kHeightDim, {0, 1, 3, 2}}, + {4, WHCN::kChannelsDim, {0, 2, 3, 1}}, + }; for (const auto& test_case : test_cases) { const size_t& ndim = std::get<0>(test_case); - const utils::GPUMemoryLayout& layout = std::get<1>(test_case); + const int32_t packed_dim = std::get<1>(test_case); const auto& expected_dim_order = std::get<2>(test_case); - std::vector dim_order = calculate_dim_order(ndim, layout); - - ASSERT_TRUE(dim_order == expected_dim_order); - } -} - -TEST_F(VulkanComputeAPITest, calculate_tensor_dim_order_test) { - // Stride, expected dim order pairs. Note that strides don't have to "make - // sense" because only they are sorted; the actual stride values don't matter. 
- std::vector, std::vector>> - test_cases = { - {{8, 1}, {0, 1}}, - {{2, 10}, {1, 0}}, - {{66, 12, 1}, {0, 1, 2}}, - {{32, 128, 4}, {1, 0, 2}}, - {{3, 8, 11, 212}, {3, 2, 1, 0}}, - {{100, 12, 9, 1}, {0, 1, 2, 3}}, - {{10, 12, 101, 6}, {2, 1, 0, 3}}, - }; - - for (const auto& test_case : test_cases) { - const auto& strides = std::get<0>(test_case); - const auto& expected_dim_order = std::get<1>(test_case); - std::vector dim_order = strides_to_dim_order(strides); + std::vector dim_order = calculate_dim_order(ndim, packed_dim); ASSERT_TRUE(dim_order == expected_dim_order); } } TEST_F(VulkanComputeAPITest, calculate_tensor_strides_test) { - // vtensor to be resized vTensor v_tensor_to_resize( context(), {25, 25, 25, 25}, @@ -243,8 +221,9 @@ TEST_F(VulkanComputeAPITest, calculate_tensor_strides_test) { for (const auto& layout : {utils::kWidthPacked, utils::kHeightPacked, utils::kChannelsPacked}) { { + const int32_t packed_dim = static_cast(layout); std::vector dim_order = - calculate_dim_order(sizes.size(), layout); + calculate_dim_order(sizes.size(), packed_dim); std::vector strides = calculate_strides(sizes, dim_order); std::vector ref_strides = get_reference_strides(sizes, layout); ASSERT_TRUE(strides == ref_strides); @@ -280,26 +259,112 @@ TEST_F(VulkanComputeAPITest, calculate_tensor_strides_test) { } } +TEST_F(VulkanComputeAPITest, virtual_transpose_test) { + std::vector sizes = {7, 9, 11, 13}; + // (dim0, dim1), new_sizes, new_dim_order, new_axis_map, new_packed_dim_idx + std::vector>> test_cases = { + {{2, 3}, {7, 9, 13, 11}, {0, 1, 3, 2}, {1, 0, 2, 2}, {1}}, + {{2, 1}, {7, 11, 9, 13}, {0, 2, 1, 3}, {0, 2, 1, 1}, {0}}, + {{1, 3}, {7, 13, 11, 9}, {0, 3, 2, 1}, {2, 1, 0, 0}, {2}}, + }; + + for (const auto& test_case : test_cases) { + const int dim0 = test_case.at(0).at(0); + const int dim1 = test_case.at(0).at(1); + + const auto& expected_sizes = test_case.at(1); + const auto& expected_dim_order = test_case.at(2); + const auto& expected_axis_map = test_case.at(3); + const int expected_packed_dim = test_case.at(4).at(0); + + { + vTensor a_buffer = vTensor( + context(), sizes, vkapi::kFloat, utils::kBuffer, utils::kWidthPacked); + + a_buffer.virtual_transpose(dim0, dim1); + EXPECT_TRUE(a_buffer.sizes() == expected_sizes); + EXPECT_TRUE(a_buffer.dim_order() == expected_dim_order); + } + + { + vTensor a_texture = vTensor( + context(), + sizes, + vkapi::kFloat, + utils::kTexture3D, + utils::kWidthPacked); + a_texture.virtual_transpose(dim0, dim1); + EXPECT_TRUE(a_texture.sizes() == expected_sizes); + EXPECT_TRUE(a_texture.axis_map() == expected_axis_map); + EXPECT_TRUE(a_texture.packed_dim() == expected_packed_dim); + } + } +} + +utils::ivec3 make_temp_ivec3(int x, int y, int z) { + return utils::ivec3{x, y, z}; +} + TEST_F(VulkanComputeAPITest, vec_test) { - utils::vec3 v3({1, 2, 3}); - ASSERT_TRUE(v3[0] == 1); - ASSERT_TRUE(v3[1] == 2); - ASSERT_TRUE(v3[2] == 3); - v3 = {4, 5, 6}; - ASSERT_TRUE(v3[0] == 4); - ASSERT_TRUE(v3[1] == 5); - ASSERT_TRUE(v3[2] == 6); - - utils::uvec4 uv4({4, 3, 2, 1}); - ASSERT_TRUE(uv4[0] == 4); - ASSERT_TRUE(uv4[1] == 3); - ASSERT_TRUE(uv4[2] == 2); - ASSERT_TRUE(uv4[3] == 1); - uv4 = {11, 13, 12, 88}; - ASSERT_TRUE(uv4[0] == 11); - ASSERT_TRUE(uv4[1] == 13); - ASSERT_TRUE(uv4[2] == 12); - ASSERT_TRUE(uv4[3] == 88); + { + utils::vec3 v3({1, 2, 3}); + ASSERT_TRUE(v3[0] == 1); + ASSERT_TRUE(v3[1] == 2); + ASSERT_TRUE(v3[2] == 3); + v3 = {4, 5, 6}; + ASSERT_TRUE(v3[0] == 4); + ASSERT_TRUE(v3[1] == 5); + ASSERT_TRUE(v3[2] == 6); + } + + { + utils::uvec4 uv4({4, 3, 2, 
1}); + ASSERT_TRUE(uv4[0] == 4); + ASSERT_TRUE(uv4[1] == 3); + ASSERT_TRUE(uv4[2] == 2); + ASSERT_TRUE(uv4[3] == 1); + uv4 = {11, 13, 12, 88}; + ASSERT_TRUE(uv4[0] == 11); + ASSERT_TRUE(uv4[1] == 13); + ASSERT_TRUE(uv4[2] == 12); + ASSERT_TRUE(uv4[3] == 88); + } + + // Test copy from same type + { + utils::ivec3 v{5, 6, 8}; + utils::ivec3 v2 = v; + + ASSERT_TRUE(v2[0] == 5); + ASSERT_TRUE(v2[1] == 6); + ASSERT_TRUE(v2[2] == 8); + } + + // Test copy from different type + { + utils::uvec3 v{5, 6, 8}; + utils::ivec3 v2 = v; + + ASSERT_TRUE(v2[0] == 5); + ASSERT_TRUE(v2[1] == 6); + ASSERT_TRUE(v2[2] == 8); + } + + // Test construction from temporary vec + { + utils::uvec3 v{make_temp_ivec3(4, 5, 10)}; + ASSERT_TRUE(v[0] == 4); + ASSERT_TRUE(v[1] == 5); + ASSERT_TRUE(v[2] == 10); + } + + // Test initalization from temporary vec + { + utils::uvec3 v = make_temp_ivec3(4, 5, 10); + ASSERT_TRUE(v[0] == 4); + ASSERT_TRUE(v[1] == 5); + ASSERT_TRUE(v[2] == 10); + } } TEST_F(VulkanComputeAPITest, retrieve_custom_shader_test) { @@ -358,7 +423,7 @@ TEST_F(VulkanComputeAPITest, spec_var_classes_test) { TEST_F(VulkanComputeAPITest, spec_var_shader_test) { size_t len = 16; - StorageBuffer buffer(context(), vkapi::kFloat, len); + StagingBuffer buffer(context(), vkapi::kFloat, len); float scale = 3.0f; float offset = 1.5f; @@ -382,7 +447,7 @@ TEST_F(VulkanComputeAPITest, spec_var_shader_test) { submit_to_gpu(); std::vector data(len); - copy_staging_to_ptr(buffer, data.data(), buffer.nbytes()); + buffer.copy_to(data.data(), buffer.nbytes()); for (size_t i = 0; i < len; ++i) { CHECK_VALUE(data, i, scale * i + offset); @@ -429,7 +494,7 @@ TEST_F(VulkanComputeAPITest, update_params_between_submit) { params.buffer()); } - StorageBuffer staging_buffer( + StagingBuffer staging_buffer( context(), vkapi::kFloat, a.staging_buffer_numel()); record_image_to_nchw_op(context(), a, staging_buffer.buffer()); @@ -450,7 +515,7 @@ TEST_F(VulkanComputeAPITest, update_params_between_submit) { template void test_storage_buffer_type(const size_t len) { - StorageBuffer buffer(context(), dtype, len); + StagingBuffer buffer(context(), dtype, len); std::string kernel_name("idx_fill_buffer"); switch (dtype) { @@ -492,7 +557,7 @@ void test_storage_buffer_type(const size_t len) { submit_to_gpu(); std::vector data(len); - copy_staging_to_ptr(buffer, data.data(), buffer.nbytes()); + buffer.copy_to(data.data(), buffer.nbytes()); for (size_t i = 0; i < len; ++i) { CHECK_VALUE(data, i, T(i)); @@ -507,7 +572,7 @@ TEST_F(VulkanComputeAPITest, test_buffer_float16) { if (!context()->adapter_ptr()->has_full_float16_buffers_support()) { GTEST_SKIP(); } - test_storage_buffer_type(16); + test_storage_buffer_type(16); } TEST_F(VulkanComputeAPITest, test_buffer_int8) { @@ -589,7 +654,7 @@ TEST_F(VulkanComputeAPITest, buffer_tensor_sanity_check) { run_buffer_tensor_sanity_check(a); break; case vkapi::kHalf: - run_buffer_tensor_sanity_check(a); + run_buffer_tensor_sanity_check(a); break; case vkapi::kChar: run_buffer_tensor_sanity_check(a); @@ -626,26 +691,30 @@ TEST_F(VulkanComputeAPITest, texture_add_sanity_check) { } } -TEST_F(VulkanComputeAPITest, tensor_copy_test) { - std::vector sizes = {9, 9}; - std::vector strides = - get_reference_strides(sizes, utils::kWidthPacked); - std::vector dim_order = {0, 1}; +TEST_F(VulkanComputeAPITest, tensor_alias_test) { + for (utils::StorageType storage_type : {utils::kTexture3D, utils::kBuffer}) { + std::vector sizes = {9, 9}; - vTensor original = CREATE_FLOAT_BUFFER(sizes, /*allocate_memory=*/true); - vTensor copy 
= vTensor(original, sizes, dim_order); - EXPECT_TRUE(get_vma_allocation_count() == 1); - EXPECT_TRUE(copy.is_view_of(original)); + const size_t alloc_count_before = get_vma_allocation_count(); - // Fill original tensor with some data - fill_vtensor(original, 2.5f, true); + vTensor original = vTensor(context(), sizes, vkapi::kFloat, storage_type); - std::vector data_out(copy.staging_buffer_numel()); - // Extract the copy tensor; should contain the data of the original tensor - extract_vtensor(copy, data_out); + vTensor copy = vTensor(original); - for (size_t i = 0; i < data_out.size(); ++i) { - CHECK_VALUE(data_out, i, 2.5f + i); + // Two tensors but only one additional allocation. + EXPECT_TRUE(get_vma_allocation_count() == alloc_count_before + 1); + EXPECT_TRUE(copy.is_view_of(original)); + + // Fill original tensor with some data + fill_vtensor(original, 2.5f, true); + + std::vector data_out(copy.staging_buffer_numel()); + // Extract the copy tensor; should contain the data of the original tensor + extract_vtensor(copy, data_out); + + for (size_t i = 0; i < original.numel(); ++i) { + CHECK_VALUE(data_out, i, 2.5f + i); + } } } @@ -655,46 +724,58 @@ TEST_F(VulkanComputeAPITest, tensor_no_copy_transpose_test) { constexpr int N = 17; std::vector mat1_sizes = {M, K}; std::vector mat2_sizes = {N, K}; - std::vector mat2_t_sizes = {K, N}; std::vector out_sizes = {M, N}; - std::vector transposed_dim_order = {1, 0}; - - vTensor mat1 = CREATE_FLOAT_BUFFER(mat1_sizes, /*allocate_memory=*/true); - vTensor mat2 = CREATE_FLOAT_BUFFER(mat2_sizes, /*allocate_memory=*/true); - vTensor out = CREATE_FLOAT_BUFFER(out_sizes, /*allocate_memory=*/true); - - // Generate data - std::vector mat1_data = - create_random_float_buffer(mat1.staging_buffer_numel()); - std::vector mat2_data = - create_random_float_buffer(mat2.staging_buffer_numel()); - - // Create direct view and modify sizes and strides later - vTensor mat2_t = vTensor(mat2); - - std::vector mat2_t_data = transpose_matrix(mat2_data, N, K); - std::vector ref_out = - compute_reference_matmul(mat1_data, mat2_t_data, M, K, N); - - // Fill original tensor with some data - fill_vtensor(mat1, mat1_data); - fill_vtensor(mat2, mat2_data); - - record_reference_matmul(api::context(), out, mat1, mat2_t); - - // Update sizes and strides of mat2_t to be that of a transposed tensor - mat2_t.virtual_reconfigure(mat2_t_sizes, transposed_dim_order); - EXPECT_TRUE(mat2_t.gpu_memory_layout() == utils::kHeightPacked); - - std::vector data_out(out.staging_buffer_numel()); - // Extract the copy tensor; should contain the data of the original tensor - extract_vtensor(out, data_out); + for (const auto storage_type : {utils::kTexture3D, utils::kBuffer}) { + vTensor mat1 = vTensor( + context(), + mat1_sizes, + vkapi::kFloat, + storage_type, + utils::kWidthPacked); + vTensor mat2 = vTensor( + context(), + mat2_sizes, + vkapi::kFloat, + storage_type, + utils::kWidthPacked); + vTensor out = vTensor( + context(), out_sizes, vkapi::kFloat, storage_type, utils::kWidthPacked); + + // Generate data + std::vector mat1_data = + create_random_float_buffer(mat1.staging_buffer_numel()); + std::vector mat2_data = + create_random_float_buffer(mat2.staging_buffer_numel()); + + // Create direct view and modify sizes and strides later + vTensor mat2_t = vTensor(mat2); + // Update sizes and strides of mat2_t to be that of a transposed tensor + mat2_t.virtual_transpose(0, 1); + + EXPECT_TRUE(mat2_t.packed_dim() == WHCN::kHeightDim); + + std::vector mat2_t_data = transpose_matrix(mat2_data, N, K); + 
std::vector ref_out = + compute_reference_matmul(mat1_data, mat2_t_data, M, K, N); + + // Fill original tensor with some data + fill_vtensor(mat1, mat1_data); + fill_vtensor(mat2, mat2_data); + + if (storage_type == utils::kTexture3D) { + record_matmul_texture3d(context(), out, mat1, mat2_t); + } else { + record_reference_matmul(context(), out, mat1, mat2_t); + } - EXPECT_TRUE(data_out.size() == ref_out.size()); + std::vector data_out(out.staging_buffer_numel()); + // Extract the copy tensor; should contain the data of the original tensor + extract_vtensor(out, data_out); - for (size_t i = 0; i < data_out.size(); ++i) { - EXPECT_TRUE(check_close(data_out[i], ref_out[i])); + for (size_t i = 0; i < ref_out.size(); ++i) { + EXPECT_TRUE(check_close(data_out[i], ref_out[i])); + } } } @@ -904,64 +985,6 @@ TEST_F(VulkanComputeAPITest, use_non_bound_textures_fails) { EXPECT_THROW(fill_vtensor(a, data_a), vkapi::Error); } -TEST_F(VulkanComputeAPITest, tensor_reallocation_test) { - std::vector sizes = {4, 4, 1}; - vTensor a = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ true); - vTensor b = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ true); - vTensor c = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ true); - - execute_and_check_add(a, b, c, 3.0f, 5.0f); - - // Redo with new sizes - std::vector new_sizes = {4, 6, 3}; - a.reallocate(new_sizes); - b.reallocate(new_sizes); - c.reallocate(new_sizes); - - // Flush everything - context()->flush(); - - execute_and_check_add(a, b, c, 12.0f, 10.0f); -} - -TEST_F( - VulkanComputeAPITest, - tensor_reallocation_with_deferred_allocation_test) { - std::vector sizes = {8, 8, 8}; - vTensor a = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ false); - vTensor b = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ false); - vTensor c = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ false); - - vkapi::Allocation a_mem = allocate_memory_for(a); - a.image().bind_allocation(a_mem); - vkapi::Allocation b_mem = allocate_memory_for(b); - b.image().bind_allocation(b_mem); - vkapi::Allocation c_mem = allocate_memory_for(c); - c.image().bind_allocation(c_mem); - - execute_and_check_add(a, b, c, 4.0f, 8.0f); - - std::vector> new_sizes_list = { - {4, 3, 5}, {4, 1, 7}, {8, 3, 2}, {8, 7, 2}}; - - for (auto& new_sizes : new_sizes_list) { - // Redo with new sizes - a.reallocate(new_sizes); - b.reallocate(new_sizes); - c.reallocate(new_sizes); - - // Flush everything - context()->flush(); - - a.image().bind_allocation(a_mem); - b.image().bind_allocation(b_mem); - c.image().bind_allocation(c_mem); - - execute_and_check_add( - a, b, c, float(new_sizes[1] + 4.5f), float(new_sizes[2] + 13.0f)); - } -} - TEST_F(VulkanComputeAPITest, texture_virtual_resize) { context()->set_cmd(/*reusable = */ true); std::vector sizes = {8, 12, 12}; @@ -1014,6 +1037,34 @@ TEST_F(VulkanComputeAPITest, texture_virtual_resize) { graph.get_tensor(name.value)->staging_buffer_numel()); \ graph.copy_from_staging(name.staging, data_##name.data(), data_##name.size()); +// The purpose of this test is simply to track the size of various classes over +// time, in the interest of making sure that they don't grow too large. +TEST_F(VulkanComputeAPITest, print_object_sizes) { +#define PRINT_SIZE(name) \ std::cout << #name << " size: " << sizeof(name) << " B" << std::endl + PRINT_SIZE(vTensor); + PRINT_SIZE(Value); + PRINT_SIZE(StagingBuffer); + PRINT_SIZE(ComputeGraph); + PRINT_SIZE(ExecuteNode); +#undef PRINT_SIZE + + // The actual sizes of each object are dependent on the platform. 
However, we + // can alert ourselves to any significant changes in the sizes of these + // objects by checking the `sizeof()` of the class against some loose thresholds. + + // Current known size on 64 bit system: 1040 B + EXPECT_TRUE(sizeof(vTensor) < 1200); + // Current known size on 64 bit system: 1056 B + EXPECT_TRUE(sizeof(Value) < 1200); + // Current known size on 64 bit system: 120 B + EXPECT_TRUE(sizeof(StagingBuffer) < 500); + // Current known size on 64 bit system: 384 B + EXPECT_TRUE(sizeof(ComputeGraph) < 500); + // Current known size on 64 bit system: 248 B + EXPECT_TRUE(sizeof(ExecuteNode) < 500); +} + TEST(VulkanComputeGraphTest, test_values_scalars) { GraphConfig config; ComputeGraph graph(config); @@ -1227,8 +1278,8 @@ TEST(VulkanComputeGraphTest, test_simple_graph) { GraphConfig config; ComputeGraph graph(config); - std::vector size_big = {8, 64, 124}; - std::vector size_small = {8, 1, 124}; + std::vector size_big = {1, 8, 8}; + std::vector size_small = {1, 1, 8}; // Build graph @@ -1268,6 +1319,64 @@ TEST(VulkanComputeGraphTest, test_simple_graph) { } } +TEST(VulkanComputeGraphTest, test_simple_graph_with_symint) { + GraphConfig config; + config.set_storage_type_override(utils::kTexture3D); + ComputeGraph graph(config); + + std::vector sizes = {8, 64, 124}; + + // Build graph + + ValueRef scalar = graph.add_symint(1); + IOValueRef a = graph.add_input_tensor(sizes, vkapi::kFloat); + + IOValueRef out = {}; + out.value = a.value; + + graph.execute_nodes().emplace_back(new ExecuteNode( + graph, + VK_KERNEL_FROM_STR("scalar_add_texture"), + graph.create_global_wg_size(a.value), + graph.create_local_wg_size(a.value), + // Inputs and Outputs + {{out.value, vkapi::MemoryAccessType::WRITE}}, + // Shader params buffers + {graph.logical_limits_ubo(a.value), + graph.get_or_create_int_param_buffer(scalar)}, + // Specialization Constants + {}, + // Resizing Logic + nullptr, + {})); + + out.staging = graph.set_output_tensor(out.value); + + graph.prepare(); + graph.encode_execute(); + + // Run graph + + for (float i = 5.0f; i < 30.0f; i += 10.0f) { + int scalar_val = i - 3.0f; + graph.set_symint(scalar, scalar_val); + + float val_a = i + 2.0f; + float val_out = val_a + scalar_val; + + fill_vtensor(graph, a, val_a); + + graph.execute(); + + EXTRACT_TENSOR(out); + + // Sanity check that the values are correct + for (size_t i = 0; i < graph.get_tensor(out.value)->numel(); ++i) { + CHECK_VALUE(data_out, i, val_out); + } + } +} + #define CREATE_WEIGHT_TENSOR(name, sizes, dtype, val) \ std::vector data_##name(utils::multiply_integers(sizes)); \ std::fill(data_##name.begin(), data_##name.end(), val); \ @@ -1335,6 +1444,7 @@ TEST(VulkanComputeGraphTest, test_simple_prepacked_graph) { TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) { GraphConfig config; ComputeGraph graph(config); + size_t expected_vma_allocation_count = 0; std::vector size_big = {12, 64, 64}; std::vector size_small = {12, 64, 64}; @@ -1351,8 +1461,10 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) { /*shared_object_idx = */ 4); // +2: t.sizes_ubo() for each staging shader + // +2: t.axis_map_ubo() for each staging shader // +2: staging buffer for each input tensor - EXPECT_TRUE(get_vma_allocation_count() == 4); + expected_vma_allocation_count += 6; + EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count); ValueRef c = graph.add_tensor( size_big, @@ -1362,15 +1474,22 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) { auto addFn = 
VK_GET_OP_FN("aten.add.Tensor"); addFn(graph, {a.value, b.value, kDummyValueRef, c}); + // +2: alpha UBO, broadcast UBO for arithmetic shader + // +1: t.sizes_ubo() for arithmetic shader output c + // +1: t.axis_map_ubo() for arithmetic shader output c + expected_vma_allocation_count += 4; + EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count); + IOValueRef d = graph.add_input_tensor( size_small, vkapi::kFloat, /*shared_object_idx = */ 2); - // +2: alpha UBO, broadcast UBO for arithmetic shader // +1: t.sizes_ubo() uniform buffer for staging shader + // +1: t.axis_map_ubo() uniform buffer for staging shader // +1: staging buffer for the input tensor - EXPECT_TRUE(get_vma_allocation_count() == 9); + expected_vma_allocation_count += 3; + EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count); ValueRef e = graph.add_tensor( size_big, @@ -1380,20 +1499,26 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) { auto mulFn = VK_GET_OP_FN("aten.mul.Tensor"); mulFn(graph, {c, d.value, e}); + // +2: alpha UBO, broadcast UBO for arithmetic shader + // +1: t.sizes_ubo() for arithmetic shader output e + // +1: t.axis_map_ubo() for arithmetic shader output e + expected_vma_allocation_count += 4; + EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count); + IOValueRef out = {}; out.value = e; out.staging = graph.set_output_tensor(out.value); - // +2: alpha UBO, broadcast UBO for arithmetic shader - // +1: t.sizes_ubo() for staging shader - // +1 staging buffer for the input tensor - EXPECT_TRUE(get_vma_allocation_count() == 13); + // +1: staging buffer for the output tensor + expected_vma_allocation_count += 1; + EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count); graph.prepare(); graph.encode_execute(); // +3: shared memory allocations for tensors - EXPECT_TRUE(get_vma_allocation_count() == 16); + expected_vma_allocation_count += 3; + EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count); // Run graph @@ -1460,6 +1585,105 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) { } } +TEST(VulkanComputeGraphTest, test_simple_graph_with_tmp_tensors) { + GraphConfig config; + ComputeGraph graph(config); + + std::vector size_big = {8, 64, 124}; + std::vector size_small = {8, 1, 124}; + + // Build graph + + IOValueRef a = graph.add_input_tensor( + size_big, vkapi::kFloat, /*shared_object_idx = */ 0); + IOValueRef b = graph.add_input_tensor( + size_small, vkapi::kFloat, /*shared_object_idx = */ 1); + + IOValueRef out = {}; + + out.value = + graph.add_tensor(size_big, vkapi::kFloat, /*shared_object_idx = */ 2); + + // Perform the following compute + // + // a, b, out; + // { + // inter; + // { + // tmp = a + b + // tmp2 = tmp + a + // inter = tmp2 + b + // } + // { + // tmp = inter + b; + // tmp2 = tmp + a + // out = tmp2 + b; + // } + // } + { + TmpTensor inter(&graph, size_big, vkapi::kFloat); + EXPECT_TRUE(inter.sobj_idx == 3); + { + TmpTensor tmp(&graph, size_big, vkapi::kFloat); + EXPECT_TRUE(tmp.sobj_idx == 4); + VK_GET_OP_FN("aten.add.Tensor") + (graph, {a, b, kDummyValueRef, tmp}); + + TmpTensor tmp2(&graph, size_big, vkapi::kFloat); + EXPECT_TRUE(tmp2.sobj_idx == 5); + VK_GET_OP_FN("aten.add.Tensor") + (graph, {tmp, a, kDummyValueRef, tmp2}); + + VK_GET_OP_FN("aten.add.Tensor") + (graph, {tmp2, b, kDummyValueRef, inter}); + } + { + TmpTensor tmp(&graph, size_big, vkapi::kFloat); + EXPECT_TRUE(tmp.sobj_idx == 4); + VK_GET_OP_FN("aten.add.Tensor") + (graph, {inter, b, kDummyValueRef, tmp}); + + 
TmpTensor tmp2(&graph, size_big, vkapi::kFloat); + EXPECT_TRUE(tmp2.sobj_idx == 5); + VK_GET_OP_FN("aten.add.Tensor") + (graph, {tmp, a, kDummyValueRef, tmp2}); + + VK_GET_OP_FN("aten.add.Tensor") + (graph, {tmp2, b, kDummyValueRef, out}); + } + } + + out.staging = graph.set_output_tensor(out.value); + + graph.prepare(); + graph.encode_execute(); + + // Run graph + + for (float i = 5.0f; i < 30.0f; i += 10.0f) { + float val_a = i + 2.0f; + float val_b = i + 1.5f; + float val_tmp = val_a + val_b; + float val_tmp2 = val_tmp + val_a; + float val_inter = val_tmp2 + val_b; + float val_tmp_2 = val_inter + val_b; + float val_tmp2_2 = val_tmp_2 + val_a; + float val_out = val_tmp2_2 + val_b; + + fill_vtensor(graph, a, val_a); + fill_vtensor(graph, b, val_b); + + graph.execute(); + + EXTRACT_TENSOR(out); + + // Sanity check that the values are correct + for (size_t i = 0; i < graph.get_tensor(out.value)->numel(); ++i) { + CHECK_VALUE(data_out, i, val_out); + } + } +} + TEST(VulkanComputeGraphTest, test_large_graph) { auto build_start_time = std::chrono::system_clock::now(); GraphConfig config; @@ -2050,9 +2274,9 @@ void run_from_gpu_test( context()->submit_compute_job( VK_KERNEL_FROM_STR(kernel_name), pipeline_barrier, - vten.image_extents(), + vten.logical_limits(), {4, 4, 4}, - {vten.packed_dim_whcn_idx(), offset}, + {vten.packed_dim(), offset}, VK_NULL_HANDLE, 0, vten.image( @@ -2062,7 +2286,7 @@ void run_from_gpu_test( vten.sizes_ubo()); } - StorageBuffer staging_buffer(context(), dtype, vten.staging_buffer_numel()); + StagingBuffer staging_buffer(context(), dtype, vten.staging_buffer_numel()); if (dtype == vkapi::kChar && !context()->adapter_ptr()->has_full_int8_buffers_support()) { @@ -2074,7 +2298,7 @@ void run_from_gpu_test( submit_to_gpu(); std::vector data_out(staging_buffer.numel()); - copy_staging_to_ptr(staging_buffer, data_out.data(), staging_buffer.nbytes()); + staging_buffer.copy_to(data_out.data(), staging_buffer.nbytes()); for (int i = 0; i < vten.numel(); i++) { CHECK_VALUE(data_out, i, i + offset); @@ -2095,18 +2319,17 @@ void round_trip_test( vTensor vten = vTensor(context(), sizes, dtype, storage_type, memory_layout); // Create and fill input staging buffer - StorageBuffer staging_buffer_in( + StagingBuffer staging_buffer_in( context(), dtype, vten.staging_buffer_numel()); std::vector data_in(staging_buffer_in.numel()); for (int i = 0; i < staging_buffer_in.numel(); i++) { data_in[i] = T(i * -1); } - copy_ptr_to_staging( - data_in.data(), staging_buffer_in, vten.staging_buffer_nbytes()); + staging_buffer_in.copy_from(data_in.data(), vten.staging_buffer_nbytes()); // Output staging buffer - StorageBuffer staging_buffer_out( + StagingBuffer staging_buffer_out( context(), dtype, vten.staging_buffer_numel()); record_nchw_to_image_op(context(), staging_buffer_in.buffer(), vten); @@ -2124,8 +2347,7 @@ void round_trip_test( // Extract data from output staging buffer std::vector data_out(staging_buffer_out.numel()); - copy_staging_to_ptr( - staging_buffer_out, data_out.data(), staging_buffer_out.nbytes()); + staging_buffer_out.copy_to(data_out.data(), staging_buffer_out.nbytes()); // All indices should be equal to the input data for (int i = 0; i < vten.numel(); i++) { @@ -2231,7 +2453,7 @@ TEST(VulkanToFromGPUShaderTest, round_trip_tests) { for (auto& sizes : to_test) { RUN_TESTS(float, vkapi::kFloat) - RUN_TESTS(torch::executor::Half, vkapi::kHalf) + RUN_TESTS(exec_aten::Half, vkapi::kHalf) } for (auto& sizes : to_test_int8) { @@ -2451,6 +2673,7 @@ TEST(VulkanComputeGraphOpsTest, 
mm_smoke_test) { prepack); CALL_TEST_FN_FOR_W_PACKED(RUN_TESTS); + CALL_TEST_FN_FOR_C_PACKED(RUN_TESTS); #undef RUN_TESTS } @@ -2559,19 +2782,18 @@ void test_conv2d( // Create and fill input staging buffer const int64_t in_numel = utils::multiply_integers(original_sizes); - StorageBuffer staging_buffer_in(context(), vkapi::kFloat, in_numel); + StagingBuffer staging_buffer_in(context(), vkapi::kFloat, in_numel); std::vector data_in(in_numel); for (int i = 0; i < in_numel; i++) { data_in[i] = i + 1; } - copy_ptr_to_staging( - data_in.data(), staging_buffer_in, sizeof(float) * in_numel); + staging_buffer_in.copy_from(data_in.data(), sizeof(float) * in_numel); // Output staging buffer const int64_t out_numel = padded_sizes[0] * padded_sizes[1] * original_sizes[2] * original_sizes[3]; - StorageBuffer staging_buffer_out(context(), vkapi::kFloat, out_numel); + StagingBuffer staging_buffer_out(context(), vkapi::kFloat, out_numel); // Copy data in and out of the tensor record_conv2d_prepack_weights_op( @@ -2583,8 +2805,7 @@ void test_conv2d( // Extract data from output staging buffer std::vector data_out(out_numel); - copy_staging_to_ptr( - staging_buffer_out, data_out.data(), sizeof(float) * out_numel); + staging_buffer_out.copy_to(data_out.data(), sizeof(float) * out_numel); // Check data matches results copied from ATen-VK for (int i = 0; i < vten.numel(); i++) { @@ -2683,13 +2904,150 @@ TEST(VulkanComputeGraphOpsTest, grid_priors_test) { /*data_out_expected = */ {4, 4, 12, 4, 20, 4, 4, 12, 12, 12, 20, 12}); } +void test_int4pack_mm( + std::vector MKN, + uint32_t group_size, + utils::StorageType storage_type) { + GraphConfig config; + ComputeGraph graph(config); + + const uint32_t M = MKN[0]; + const uint32_t K = MKN[1]; + const uint32_t N = MKN[2]; + + const std::vector mat1_size = {M, K}; + const std::vector mat2_size = {K, N}; + const std::vector mat2_q_size = {N, K / 2}; // Transposed and packed + const std::vector out_size = {M, N}; + + std::vector A_data = create_random_float_buffer(M * K); + IOValueRef A = graph.add_input_tensor(mat1_size, vkapi::kFloat, storage_type); + graph.copy_into_staging(A.staging, A_data.data(), A_data.size()); + + // Quantized but un-packed weights + std::vector B_quant_data = create_random_uint8_buffer(K * N, 0, 16); + + // Pack and transpose weights to correspond to int4 weight format + std::vector B_int4_data = + int4mm_pack_weights(mat2_size, B_quant_data.data()); + + IOValueRef B_int4 = + graph.add_input_tensor(mat2_q_size, vkapi::kQInt8, utils::kBuffer); + graph.copy_into_staging( + B_int4.staging, B_int4_data.data(), B_int4_data.size()); + + const int k_groups = K / group_size; + + // Random scales and zeroes. 
Keep scales small to avoid overflow and zeroes in + // int4 range + IOValueRef scales_and_zeros; + + if (storage_type == utils::kBuffer) { + scales_and_zeros.value = graph.add_tensor( + {2, N, k_groups}, vkapi::kFloat, storage_type, utils::kWidthPacked); + } else { + scales_and_zeros.value = graph.add_tensor( + {2, N, k_groups}, vkapi::kFloat, storage_type, utils::kChannelsPacked); + } + + scales_and_zeros.staging = graph.set_input_tensor(scales_and_zeros.value); + + std::vector s_data(graph.numel_of(scales_and_zeros.value)); + const int zeros_stride = s_data.size() / 2; + for (size_t i = 0; i < zeros_stride; i++) { + s_data[i] = rand() % 100; + s_data[i + zeros_stride] = rand() % 16; + } + + graph.copy_into_staging( + scales_and_zeros.staging, s_data.data(), s_data.size()); + + IOValueRef out_int4; + + if (storage_type == utils::kBuffer) { + out_int4.value = graph.add_tensor(out_size, vkapi::kFloat, utils::kBuffer); + } else { + out_int4.value = + graph.add_tensor(out_size, vkapi::kFloat, utils::kChannelsPacked); + } + + VK_GET_OP_FN("aten._weight_int4pack_mm.default") + (graph, + {A.value, + B_int4.value, + graph.add_scalar(group_size), + scales_and_zeros.value, + out_int4.value}); + + out_int4.staging = graph.set_output_tensor(out_int4.value); + + // Dequantized matmul for comparison + IOValueRef B_deq = + graph.add_input_tensor(mat2_size, vkapi::kFloat, storage_type); + std::vector B_deq_data = int4mm_dequantize_weights( + mat2_size, B_quant_data.data(), group_size, s_data.data()); + graph.copy_into_staging(B_deq.staging, B_deq_data.data(), B_deq_data.size()); + + IOValueRef out_deq; + out_deq.value = graph.add_tensor(out_size, vkapi::kFloat, storage_type); + + VK_GET_OP_FN("aten.mm.default") + (graph, {A.value, B_deq.value, out_deq.value}); + + out_deq.staging = graph.set_output_tensor(out_deq.value); + + graph.prepare(); + graph.encode_prepack(); + graph.prepack(); + graph.encode_execute(); + graph.propagate_resize(); + graph.execute(); + + // Compare outputs + std::vector out_int4_data(graph.numel_of(out_int4.value)); + graph.copy_from_staging( + out_int4.staging, out_int4_data.data(), out_int4_data.size()); + + std::vector out_deq_data(graph.numel_of(out_deq.value)); + graph.copy_from_staging( + out_deq.staging, out_deq_data.data(), out_deq_data.size()); + + for (int i = 0; i < out_int4_data.size(); i++) { + EXPECT_TRUE(check_close(out_int4_data[i], out_deq_data[i])); + } +} + +TEST(VulkanComputeGraphOpsTest, int4pack_mm_test) { + if (!context()->adapter_ptr()->has_full_int8_buffers_support()) { + GTEST_SKIP(); + } + + for (auto storage_type : {utils::kBuffer, utils::kTexture3D}) { + // Vector multiplication, single group per row + test_int4pack_mm({1, 32, 1}, 32, storage_type); + + // Vector multiplication, multiple groups per row + test_int4pack_mm({1, 256, 1}, 64, storage_type); + + // Square matrices, single group per row + test_int4pack_mm({32, 32, 32}, 32, storage_type); + + // Irregular matrices, single group per row + test_int4pack_mm({37, 32, 19}, 32, storage_type); + + // Irregular matrices, multiple groups per row + test_int4pack_mm({37, 256, 19}, 64, storage_type); + } +} + void test_transpose_view_mm( const int B, const int M, const int K, - const int N) { + const int N, + utils::StorageType storage_type) { GraphConfig config; - config.set_storage_type_override(utils::kBuffer); + config.set_storage_type_override(storage_type); ComputeGraph graph(config); std::vector mat1_size = {M, K}; @@ -2717,10 +3075,10 @@ void test_transpose_view_mm( IOValueRef mat1 = 
graph.add_input_tensor(mat1_size, vkapi::kFloat, utils::kWidthPacked); - IOValueRef mat2_t = + IOValueRef mat2_transpose = graph.add_input_tensor(mat2_t_size, vkapi::kFloat, utils::kWidthPacked); - ValueRef mat2 = graph.add_tensor_view(mat2_t.value); + ValueRef mat2 = graph.add_tensor_view(mat2_transpose.value); ValueRef dim0; ValueRef dim1; @@ -2736,7 +3094,8 @@ void test_transpose_view_mm( IOValueRef out; out.value = graph.add_tensor(out_size, vkapi::kFloat, utils::kWidthPacked); - VK_GET_OP_FN("aten.transpose.int")(graph, {mat2_t.value, dim0, dim1, mat2}); + VK_GET_OP_FN("aten.transpose.int") + (graph, {mat2_transpose.value, dim0, dim1, mat2}); VK_GET_OP_FN("aten.mm.default")(graph, {mat1.value, mat2, out.value}); out.staging = graph.set_output_tensor(out.value); @@ -2767,5 +3126,7 @@ void test_transpose_view_mm( } TEST(VulkanComputeGraphOpsTest, test_transpose_with_mm) { - test_transpose_view_mm(2, 7, 17, 5); + for (auto storage_type : {utils::kBuffer, utils::kTexture3D}) { + test_transpose_view_mm(2, 7, 17, 5, storage_type); + } } diff --git a/backends/vulkan/tools/gpuinfo/include/architecture.h b/backends/vulkan/tools/gpuinfo/include/architecture.h index 0d312ee87c3..9af908eb170 100644 --- a/backends/vulkan/tools/gpuinfo/include/architecture.h +++ b/backends/vulkan/tools/gpuinfo/include/architecture.h @@ -40,7 +40,7 @@ void reg_count(const App& app) { uint32_t NITER; auto bench = [&](uint32_t ngrp, uint32_t nreg) { - StorageBuffer buffer(context(), vkapi::kFloat, 1); + StagingBuffer buffer(context(), vkapi::kFloat, 1); vkapi::PipelineBarrier pipeline_barrier{}; auto shader_name = "reg_count_" + std::to_string(nreg); @@ -164,7 +164,7 @@ void warp_size(const App& app, const bool verbose = false) { uint32_t NITER; auto bench = [&](uint32_t nthread) { - StorageBuffer out_buf(context(), vkapi::kInt, app.nthread_logic); + StagingBuffer out_buf(context(), vkapi::kInt, app.nthread_logic); vkapi::PipelineBarrier pipeline_barrier{}; auto shader_name = "warp_size_physical"; @@ -224,7 +224,7 @@ void warp_size(const App& app, const bool verbose = false) { // doesn't depend on kernel timing, so the extra wait time doesn't lead to // inaccuracy. auto bench_sm = [&](uint32_t nthread) { - StorageBuffer out_buf(context(), vkapi::kInt, app.nthread_logic); + StagingBuffer out_buf(context(), vkapi::kInt, app.nthread_logic); vkapi::PipelineBarrier pipeline_barrier{}; auto shader_name = "warp_size_scheduler"; @@ -242,7 +242,7 @@ void warp_size(const App& app, const bool verbose = false) { }); std::vector data(app.nthread_logic); - copy_staging_to_ptr(out_buf, data.data(), out_buf.nbytes()); + out_buf.copy_to(data.data(), out_buf.nbytes()); if (verbose) { std::stringstream ss; diff --git a/backends/vulkan/tools/gpuinfo/include/buffers.h b/backends/vulkan/tools/gpuinfo/include/buffers.h index c8cf93c4a12..31137b11eea 100644 --- a/backends/vulkan/tools/gpuinfo/include/buffers.h +++ b/backends/vulkan/tools/gpuinfo/include/buffers.h @@ -35,8 +35,8 @@ void buf_cacheline_size(const App& app) { uint32_t NITER; auto bench = [&](int stride) { - StorageBuffer in_buf(context(), vkapi::kFloat, BUF_SIZE); - StorageBuffer out_buf(context(), vkapi::kFloat, 1); + StagingBuffer in_buf(context(), vkapi::kFloat, BUF_SIZE); + StagingBuffer out_buf(context(), vkapi::kFloat, 1); vkapi::PipelineBarrier pipeline_barrier{}; auto shader_name = "buf_cacheline_size"; @@ -132,8 +132,8 @@ void _bandwidth( // workgroups, once the size of the access excedes the workgroup width. 
const uint32_t workgroup_width = local_x * NITER * NUNROLL; - StorageBuffer in_buf(context(), vkapi::kFloat, range / sizeof(float)); - StorageBuffer out_buf( + StagingBuffer in_buf(context(), vkapi::kFloat, range / sizeof(float)); + StagingBuffer out_buf( context(), vkapi::kFloat, VEC_WIDTH * app.nthread_logic); vkapi::PipelineBarrier pipeline_barrier{}; diff --git a/backends/vulkan/tools/gpuinfo/include/textures.h b/backends/vulkan/tools/gpuinfo/include/textures.h index 7679f11b0ca..c9ff133f1ec 100644 --- a/backends/vulkan/tools/gpuinfo/include/textures.h +++ b/backends/vulkan/tools/gpuinfo/include/textures.h @@ -61,7 +61,7 @@ void tex_cacheline_concurr(const App& app) { vTensor in_tensor = api::vTensor(api::context(), sizes_nchw, vkapi::kFloat); - StorageBuffer out_buf(context(), vkapi::kFloat, TEXEL_WIDTH); + StagingBuffer out_buf(context(), vkapi::kFloat, TEXEL_WIDTH); vkapi::PipelineBarrier pipeline_barrier{}; @@ -173,7 +173,7 @@ void tex_bandwidth(const App& app) { // workgroups, once the size of the access excedes the workgroup width. const uint32_t workgroup_width = local_x * NITER * NUNROLL; - StorageBuffer out_buf( + StagingBuffer out_buf( context(), vkapi::kFloat, VEC_WIDTH * app.nthread_logic); vkapi::PipelineBarrier pipeline_barrier{}; diff --git a/backends/vulkan/vulkan_preprocess.py b/backends/vulkan/vulkan_preprocess.py index 1865c32acd7..7e85c25faee 100644 --- a/backends/vulkan/vulkan_preprocess.py +++ b/backends/vulkan/vulkan_preprocess.py @@ -57,7 +57,7 @@ def preprocess( # noqa: C901 MeanToSumDiv(), SpecPropPass(), ConstraintBasedSymShapeEvalPass(), - MemoryPlanningPass("greedy"), + MemoryPlanningPass(), ] new_gm = program.graph_module diff --git a/backends/xnnpack/CMakeLists.txt b/backends/xnnpack/CMakeLists.txt index a5b12d65799..c22f029c263 100644 --- a/backends/xnnpack/CMakeLists.txt +++ b/backends/xnnpack/CMakeLists.txt @@ -32,9 +32,11 @@ if(NOT PYTHON_EXECUTABLE) resolve_python_executable() endif() -# NB: Enabling this will serialize execution of delegate instances -# Keeping this OFF by default to maintain existing behavior, to be revisited. -option(EXECUTORCH_XNNPACK_SHARED_WORKSPACE "Enable workspace sharing across different delegate instances" OFF) +# NB: Enabling this will serialize execution of delegate instances. +# This setting may have performance implications. +option(EXECUTORCH_XNNPACK_SHARED_WORKSPACE + "Enable workspace sharing across different delegate instances" ON +) if(EXECUTORCH_XNNPACK_SHARED_WORKSPACE) add_definitions(-DENABLE_XNNPACK_SHARED_WORKSPACE) endif() diff --git a/backends/xnnpack/README.md b/backends/xnnpack/README.md index 33a0bfaf309..0c3d7e14428 100644 --- a/backends/xnnpack/README.md +++ b/backends/xnnpack/README.md @@ -105,9 +105,10 @@ mkdir cmake-out cmake \ -DCMAKE_INSTALL_PREFIX=cmake-out \ -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_XNNPACK=ON \ - -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_ENABLE_LOGGING=ON \ -DPYTHON_EXECUTABLE=python \ -Bcmake-out . 
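A note on the MemoryPlanningPass() change in backends/vulkan/vulkan_preprocess.py above: dropping the explicit "greedy" argument is behavior-preserving only if the pass's default algorithm is still greedy. A minimal sketch of that assumption (the import path is assumed, not taken from this patch):

# Sketch only: assumes MemoryPlanningPass defaults to the greedy algorithm.
from executorch.exir.passes import MemoryPlanningPass

old_style = MemoryPlanningPass("greedy")  # previous, explicit spelling
new_style = MemoryPlanningPass()          # new spelling; relies on the default algorithm
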
diff --git a/backends/xnnpack/cmake/Dependencies.cmake b/backends/xnnpack/cmake/Dependencies.cmake index 40e4e72c38b..b76c54bee60 100644 --- a/backends/xnnpack/cmake/Dependencies.cmake +++ b/backends/xnnpack/cmake/Dependencies.cmake @@ -36,6 +36,10 @@ set(XNNPACK_ENABLE_AVXVNNI OFF CACHE BOOL "" ) +set(XNNPACK_ENABLE_KLEIDIAI + OFF + CACHE BOOL "" +) add_subdirectory("${XNNPACK_SOURCE_DIR}") include_directories(SYSTEM ${XNNPACK_INCLUDE_DIR}) list(APPEND xnnpack_third_party XNNPACK) diff --git a/backends/xnnpack/operators/op_dynamic_dequantize_ops.py b/backends/xnnpack/operators/op_dynamic_dequantize_ops.py index d47f9f479e4..f8f0c54ee68 100644 --- a/backends/xnnpack/operators/op_dynamic_dequantize_ops.py +++ b/backends/xnnpack/operators/op_dynamic_dequantize_ops.py @@ -12,7 +12,15 @@ register_node_visitor, ) from executorch.backends.xnnpack.serialization.xnnpack_graph_schema import XNNGraph -from executorch.backends.xnnpack.utils.utils import get_input_node +from executorch.backends.xnnpack.utils.quant_utils import ( + is_per_channel_group, + is_per_token, +) +from executorch.backends.xnnpack.utils.utils import ( + check_or_raise, + get_input_node, + is_param_node, +) @register_node_visitor @@ -65,3 +73,40 @@ def define_node( dq_input = get_input_node(node, 0) if dq_input in vals_to_ids: vals_to_ids[node] = vals_to_ids[dq_input] + + +@register_node_visitor +class OpDequantizeAffine(NodeVisitor): + target = "quant.dequantize_affine.default" + + def __init__(self, *args) -> None: + super().__init__(*args) + + def define_node( + self, + node: torch.fx.Node, + xnn_graph: XNNGraph, + vals_to_ids: Dict[torch.fx.Node, int], + debug_handle: int, + ) -> None: + """ + We always define dequantize affine nodes because they are always explicit + """ + if is_per_channel_group(node): + check_or_raise( + is_param_node(self._exported_program, node.all_input_nodes[0]), + f"Expected quantize affine node with per-token semantics to be used " + f"in front of a weight node, but found node {node.all_input_nodes[0]}", + ) + # Affine dequantize was recognized as per channel group which means that it should + # be skipped as this means it is used in front of a weight node + return + + check_or_raise( + is_per_token(node), + "Expecting Affine Dequantized Op to have per-token semantics", + ) + # This must be a per-token affine dequantized node, so let us serialize as such + dq_input = get_input_node(node, 0) + if dq_input in vals_to_ids: + vals_to_ids[node] = vals_to_ids[dq_input] diff --git a/backends/xnnpack/operators/op_dynamic_quantize_ops.py b/backends/xnnpack/operators/op_dynamic_quantize_ops.py index bf5f3b7b092..23047e731f7 100644 --- a/backends/xnnpack/operators/op_dynamic_quantize_ops.py +++ b/backends/xnnpack/operators/op_dynamic_quantize_ops.py @@ -17,6 +17,10 @@ XNNGraph, XNode, ) +from executorch.backends.xnnpack.utils.quant_utils import ( + is_per_channel_group, + is_per_token, +) from executorch.backends.xnnpack.utils.utils import check_or_raise, get_input_node @@ -118,3 +122,56 @@ def define_node( debug_handle=debug_handle, ) xnn_graph.xnodes.append(ser_node) + + +@register_node_visitor +class OpQuantizeAffine(NodeVisitor): + target = "quant.quantize_affine.default" + + def define_node( + self, + node: torch.fx.Node, + xnn_graph: XNNGraph, + vals_to_ids: Dict[torch.fx.Node, int], + debug_handle: int, + ) -> None: + """ + We always define quantize affine nodes because they are always explicit + """ + if is_per_channel_group(node): + # Affine quantized was recognized as per channel group which means 
that it should + # be skipped as this means it is used in front of a weight node + return + + check_or_raise( + is_per_token(node), + "Encountered affine quantized op which does not have per-token semantics", + ) + # Treat this node as dynamic per-token quantization + q_input = get_input_node(node, 0) + + # fp32 input + self.define_tensor(q_input, xnn_graph, vals_to_ids) + input_id = vals_to_ids[q_input] + + # dynamic quantized output + input_quant_params = QuantParams.from_q_dq_node(node) + # qinput isn't needed for dynamically quantized nodes since it will always be + # the output of a convert node. Instead we set q_input to the node itself so + # we can extract the shape from the dq output + input_quant_params.q_input = node + input_quant_params.is_input = False + check_or_raise( + input_quant_params.is_dynamic, + "Internal Error, dynamically quantized node expected dynamic quantized params", + ) + self.define_tensor( + node, xnn_graph, vals_to_ids, quant_params=input_quant_params + ) + output_id = vals_to_ids[node] + + ser_node = XNode( + xnode_union=XNNConvert(input_id=input_id, output_id=output_id, flags=0), + debug_handle=debug_handle, + ) + xnn_graph.xnodes.append(ser_node) diff --git a/backends/xnnpack/operators/op_skip_ops.py b/backends/xnnpack/operators/op_skip_ops.py index d6a54c901eb..6597c0568e3 100644 --- a/backends/xnnpack/operators/op_skip_ops.py +++ b/backends/xnnpack/operators/op_skip_ops.py @@ -97,6 +97,15 @@ class OpSymSizeInt(OpSkipOps): target = "sym_size.int" +@register_node_visitor +class OpChooseQparamsAffine(OpSkipOps): + """ + do nothing if node is choose_qparams_affine.default + """ + + target = "quant.choose_qparams_affine.default" + + @register_node_visitor class OpChooseQparamsToken(OpSkipOps): """ diff --git a/backends/xnnpack/operators/quant_params.py b/backends/xnnpack/operators/quant_params.py index d60c300276f..44908ac7fca 100644 --- a/backends/xnnpack/operators/quant_params.py +++ b/backends/xnnpack/operators/quant_params.py @@ -10,7 +10,15 @@ import torch from executorch.backends.xnnpack.passes.tag_implicit_q_dq_pass import TagImplicitQDqPass -from executorch.backends.xnnpack.utils.quant_utils import is_dequant, is_quant +from executorch.backends.xnnpack.utils.quant_utils import ( + extract_qdq_affine_op_args_for_decomposed_ops, + is_affine_qdq, + is_dequant, + is_dynamic_qdq, + is_per_channel, + is_per_channel_group, + is_quant, +) from executorch.backends.xnnpack.utils.utils import ( check_or_raise, get_param_tensor, @@ -154,30 +162,18 @@ def from_q_dq_node( q_input = quant_node.all_input_nodes[0] # TODO: Use presence of choose_qparam node to determine if this is a dynamic quantization - if quant_node.target in [ - exir_ops.edge.quantized_decomposed.quantize_per_tensor.tensor, - exir_ops.edge.quantized_decomposed.dequantize_per_tensor.tensor, - exir_ops.edge.quantized_decomposed.quantize_per_token.default, - exir_ops.edge.quantized_decomposed.dequantize_per_token.default, - ]: + if is_dynamic_qdq(quant_node): return cls._from_dynamic_input_node(quant_node) - per_channel = quant_node.target in [ - exir_ops.edge.quantized_decomposed.quantize_per_channel.default, - exir_ops.edge.quantized_decomposed.dequantize_per_channel.default, - ] - - _groupwise = False - if quant_node.target in [ - exir_ops.edge.quantized_decomposed.quantize_per_channel_group.default, - exir_ops.edge.quantized_decomposed.dequantize_per_channel_group.default, - ]: - # This is a sub-category of per channel quantization - per_channel = True - _groupwise = True - - scale = 
quant_node.args[1] - zp = quant_node.args[2] + per_channel = is_per_channel(quant_node) + + _groupwise = is_per_channel_group(quant_node) + quant_node_args = quant_node.args + if _groupwise and is_affine_qdq(quant_node): + quant_node_args = extract_qdq_affine_op_args_for_decomposed_ops(quant_node) + + scale = quant_node_args[1] + zp = quant_node_args[2] axis = 0 if per_channel: assert isinstance(scale, torch.fx.Node) and isinstance(scale.target, str) @@ -193,10 +189,15 @@ def _get_tensor(node): scale = _get_tensor(scale) zp = _get_tensor(zp) - axis = cast(int, quant_node.args[3]) + axis = cast(int, quant_node_args[3]) if _groupwise: scale_tensor = cast(torch.Tensor, scale) + if scale_tensor.ndim == 1: + scale_tensor = scale_tensor.reshape(-1, 1) + zp = zp.reshape(-1, 1) + scale = scale_tensor + assert ( scale_tensor.ndim == 2 ), "Weight scale must be 2D for per_channel_group [de]quant node, got {scale.ndim}D" @@ -204,23 +205,23 @@ def _get_tensor(node): check_or_raise( bool( - quant_node.args[-1] != torch.uint8 - or quant_node.args[-1] != torch.quint8 + quant_node_args[-1] != torch.uint8 + or quant_node_args[-1] != torch.quint8 ), "XNNPACK does not support unsigned quantization", ) if _groupwise: - _ = quant_node.args[-1] # output dtype - not used - group_size = cast(int, quant_node.args[-2]) - dtype = cast(torch.dtype, quant_node.args[-3]) - qmax = cast(int, quant_node.args[-4]) - qmin = cast(int, quant_node.args[-5]) + _ = quant_node_args[-1] # output dtype - not used + group_size = cast(int, quant_node_args[-2]) + dtype = cast(torch.dtype, quant_node_args[-3]) + qmax = cast(int, quant_node_args[-4]) + qmin = cast(int, quant_node_args[-5]) else: group_size = 0 - dtype = cast(torch.dtype, quant_node.args[-1]) - qmax = cast(int, quant_node.args[-2]) - qmin = cast(int, quant_node.args[-3]) + dtype = cast(torch.dtype, quant_node_args[-1]) + qmax = cast(int, quant_node_args[-2]) + qmin = cast(int, quant_node_args[-3]) is_output = any( user_node.op == "output" for user_node in quant_node.users.keys() @@ -244,26 +245,14 @@ def _get_tensor(node): def from_weights( cls, tensor_node: torch.fx.Node, ep: Optional[ExportedProgram] = None ) -> Optional[QuantParams]: - # Ignore transpose for weights - # TODO:T148540997 remove the t_copy/permute_copy check when convert addmm to linear - dq = ( - tensor_node.all_input_nodes[0] - if tensor_node.target - in ( - exir_ops.edge.aten.permute_copy.default, - exir_ops.edge.aten.t_copy.default, - ) - else tensor_node - ) - # check input of t_copy/permute_copy is dequant - if not is_dequant(dq): + if not is_dequant(tensor_node): return None # source node for quant params - src = dq + src = tensor_node # is input of dq is q? 
- dq_input = dq.all_input_nodes[0] + dq_input = src.all_input_nodes[0] if is_quant(dq_input): src = dq_input diff --git a/backends/xnnpack/partition/config/__init__.py b/backends/xnnpack/partition/config/__init__.py index 141ccf9802e..ed105dc1f53 100644 --- a/backends/xnnpack/partition/config/__init__.py +++ b/backends/xnnpack/partition/config/__init__.py @@ -53,6 +53,11 @@ MaxDimConfig, PreluConfig, ) +from executorch.backends.xnnpack.partition.config.quant_affine_configs import ( + ChooseQParamsAffineConfig, + DeQuantizeAffineConfig, + QuantizeAffineConfig, +) from executorch.backends.xnnpack.partition.config.xnnpack_config import ( XNNPartitionerConfig, ) @@ -98,4 +103,8 @@ # Quant/Dequant Op Configs QuantizedPerTensorConfig, DeQuantizedPerTensorConfig, + # Quant Affine Configs to preserve decomp + QuantizeAffineConfig, + DeQuantizeAffineConfig, + ChooseQParamsAffineConfig, ] diff --git a/backends/xnnpack/partition/config/gemm_configs.py b/backends/xnnpack/partition/config/gemm_configs.py index 3c4d446a6b4..cbcb14899d4 100644 --- a/backends/xnnpack/partition/config/gemm_configs.py +++ b/backends/xnnpack/partition/config/gemm_configs.py @@ -4,6 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +import logging from itertools import chain from typing import cast, List, Optional, Tuple @@ -13,9 +14,12 @@ XNNPartitionerConfig, ) from executorch.backends.xnnpack.utils.quant_utils import ( + extract_qdq_affine_op_args_for_decomposed_ops, + is_affine_qdq, is_dequant, is_dynamic_qdq, is_per_channel, + is_per_channel_group, is_qparam, is_quant, ) @@ -28,12 +32,16 @@ from executorch.exir.backend.canonical_partitioners.config_partitioner import ( format_target_name, ) +from executorch.exir.backend.utils import WhyNoPartition from torch.export import ExportedProgram from torch.fx.passes.utils.source_matcher_utils import ( get_source_partitions, SourcePartition, ) +logger = logging.getLogger(__name__) +why = WhyNoPartition(logger=logger) + class GEMMConfig(XNNPartitionerConfig): """ @@ -44,8 +52,8 @@ class GEMMConfig(XNNPartitionerConfig): different ops """ - def __init__(self, weight_idx, bias_idx, act_idx, fused_acts): - super().__init__() + def __init__(self, weight_idx, bias_idx, act_idx, fused_acts, **kwargs): + super().__init__(**kwargs) self.weight_idx = weight_idx self.bias_idx = bias_idx self.act_idx = act_idx @@ -57,6 +65,8 @@ def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool: return False is_valid, _ = self.get_deps(node, ep) + if not is_valid: + why(node, "Failed to get valid dependent nodes.") return is_valid def get_node_and_deps( @@ -131,7 +141,7 @@ def _get_weight_deps( return False, [] gemm_deps.append(weight) - if is_per_channel(dequant_node): + if is_per_channel(dequant_node) or is_per_channel_group(dequant_node): if len(dequant_node.all_input_nodes) < 2: # Expected channel quantized to have scale/zp nodes return False, [] @@ -214,12 +224,15 @@ def _get_act_deps( return (False, []) gemm_deps.append(q_input) - if not (is_node(q_input.args[1]) and is_node(q_input.args[2])): + q_input_args = q_input.args + if is_affine_qdq(q_input): + q_input_args = extract_qdq_affine_op_args_for_decomposed_ops(q_input) + if not (is_node(q_input_args[1]) and is_node(q_input_args[2])): # expected to find getitem node from choose qparam return (False, []) - getitem1 = get_input_node(q_input, 1) - getitem2 = get_input_node(q_input, 2) + getitem1 = q_input_args[1] + getitem2 = 
q_input_args[2] if not (is_getitem(getitem1) and is_getitem(getitem2)): # expected getitem node from choose qparam @@ -237,17 +250,28 @@ def _get_act_deps( class LinearConfig(GEMMConfig): target_name = "linear.default" - def __init__(self): + def __init__(self, **kwargs): super().__init__( weight_idx=1, bias_idx=2, act_idx=0, fused_acts=["relu.default", "hardtanh.default"], + **kwargs, ) def get_original_aten(self) -> Optional[torch._ops.OpOverload]: return torch.ops.aten.linear.default + def _get_weight_deps( + self, node: torch.fx.Node, ep: ExportedProgram, precision: ConfigPrecisionType + ) -> Tuple[bool, List[torch.fx.Node]]: + if precision == ConfigPrecisionType.FP32 and self.force_fp32_dynamic_linear: + # if force fp32_dynamic_linear is on and we detected this as fp32, then we + # do not partition the weight node + return (True, []) + + return super()._get_weight_deps(node, ep, precision) + def supported_precision_types(self): return [ ConfigPrecisionType.DYNAMIC_QUANT, @@ -259,12 +283,13 @@ def supported_precision_types(self): class ConvolutionConfig(GEMMConfig): target_name = "convolution.default" - def __init__(self): + def __init__(self, **kwargs): super().__init__( weight_idx=1, bias_idx=2, act_idx=0, fused_acts=["relu.default", "hardtanh.default"], + **kwargs, ) def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool: @@ -276,10 +301,12 @@ def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool: conv_stride = cast(List[int], node.args[3]) if len(conv_stride) > 2: + why(node, "Only support 1D + 2D Conv") return False # Only support 1D + 2D Conv transposed = cast(bool, node.args[6]) if transposed: + why(node, "Transposed Conv is not supported") return False # Currently don't support transposed conv return True @@ -299,12 +326,13 @@ class AddmmConfig(GEMMConfig): target_name = "addmm.default" - def __init__(self): + def __init__(self, **kwargs): super().__init__( weight_idx=2, bias_idx=0, act_idx=1, fused_acts=["relu.default", "hardtanh.default"], + **kwargs, ) self.src_partitions = None self.linear_modules = [torch.nn.functional.linear, torch.nn.Linear] @@ -402,8 +430,8 @@ def supported_precision_types(self): class MMConfig(AddmmConfig): target_name = "mm.default" - def __init__(self): - super().__init__() + def __init__(self, **kwargs): + super().__init__(**kwargs) self.bias_idx = None self.weight_idx = 1 self.act_idx = 0 diff --git a/backends/xnnpack/partition/config/generic_node_configs.py b/backends/xnnpack/partition/config/generic_node_configs.py index e309a3bd038..b95d7c5b89c 100644 --- a/backends/xnnpack/partition/config/generic_node_configs.py +++ b/backends/xnnpack/partition/config/generic_node_configs.py @@ -4,6 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
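The force_fp32_dynamic_linear flag threaded through GEMMConfig/LinearConfig above is read from **kwargs in the base config and forwarded by the partitioner (see the xnnpack_config.py and xnnpack_partitioner.py hunks later in this patch). A hedged usage sketch, assuming an otherwise standard lowering flow:

# Sketch: keep fp32 dynamic-linear weights out of the delegated partition.
from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner

partitioner = XnnpackPartitioner(force_fp32_dynamic_linear=True)
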
+import logging from typing import cast, List, Optional import torch @@ -16,17 +17,21 @@ from executorch.exir.backend.canonical_partitioners.config_partitioner import ( format_target_name, ) +from executorch.exir.backend.utils import WhyNoPartition from torch.export import ExportedProgram +logger = logging.getLogger(__name__) +why = WhyNoPartition(logger=logger) + class GenericNodePartitionerConfig(XNNPartitionerConfig): - def __init__(self, fused_act: Optional[List[str]] = None): + def __init__(self, fused_act: Optional[List[str]] = None, **kwargs): """ fused_act is a list of node target names that can be fused with this node under quantization """ self.fused_acts = fused_act or [] - super().__init__() + super().__init__(**kwargs) def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool: return self.check_common_constraints(node, ep) @@ -93,8 +98,8 @@ def supported_precision_types(self) -> List[ConfigPrecisionType]: class AddConfig(GenericNodePartitionerConfig): target_name = "add.Tensor" - def __init__(self): - super().__init__(fused_act=["relu.default"]) + def __init__(self, **kwargs): + super().__init__(fused_act=["relu.default"], **kwargs) def supported_precision_types(self) -> List[ConfigPrecisionType]: return [ConfigPrecisionType.FP32, ConfigPrecisionType.STATIC_QUANT] @@ -141,9 +146,22 @@ def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool: if len(args) >= 7: divisor_override = cast(int, args[6]) - return ( - not (ceil_mode or count_include_pad) and divisor_override == pooling_region - ) + if ceil_mode: + why(node, reason="ceil mode is not supported") + return False + + if count_include_pad: + why( + node, + reason="zero-padding in the averaging calculation is not supported", + ) + return False + + if divisor_override != pooling_region: + why(node, reason="divisor override is not supported") + return False + + return True def supported_precision_types(self) -> List[ConfigPrecisionType]: return [ConfigPrecisionType.FP32] @@ -160,7 +178,15 @@ def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool: return False num_tensors = len(node.all_input_nodes) - return num_tensors >= 2 and num_tensors <= 4 + + if not (num_tensors >= 2 and num_tensors <= 4): + why( + node, + reason=f"only support concatenation of 2 - 4 tensors, got {num_tensors} tensors", + ) + return False + + return True def supported_precision_types(self) -> List[ConfigPrecisionType]: return [ConfigPrecisionType.FP32, ConfigPrecisionType.STATIC_QUANT] @@ -210,7 +236,14 @@ def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool: dim = cast(int, node.args[1]) node_input = node.all_input_nodes[0] tensor_dims = node_input.meta["val"].dim() - return dim == -1 or dim == tensor_dims - 1 + + if not (dim == -1 or dim == tensor_dims - 1): + why( + node, + reason=f"dim must be the last dim, got dim = {dim} for tensor of rank {tensor_dims}", + ) + return False + return True def supported_precision_types(self) -> List[ConfigPrecisionType]: return [ConfigPrecisionType.FP32] @@ -255,7 +288,10 @@ def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool: return False is_ceil_mode = len(node.args) >= 6 and cast(bool, node.args[5]) - return not is_ceil_mode + if is_ceil_mode: + why(node, reason="ceil mode is not supported") + return False + return True def supported_precision_types(self) -> List[ConfigPrecisionType]: return [ConfigPrecisionType.FP32, ConfigPrecisionType.STATIC_QUANT] @@ -309,7 +345,20 @@ def check_constraints(self, 
node: torch.fx.Node, ep: ExportedProgram) -> bool: dims = node.args[1] output_dims = node.meta["val"].dim() - return dims in ([-2, -1], [-1, -2]) and output_dims == 4 + if dims not in ([-2, -1], [-1, -2]): + why( + node, + reason="mean.dim only supports averaging 4D tensors across the innermost dimensions", + ) + return False + + if output_dims != 4: + why( + node, + reason=f"mean.dim only supports averaging 4D tensors, got tensor of rank {output_dims}", + ) + return False + return True def supported_precision_types(self) -> List[ConfigPrecisionType]: return [ConfigPrecisionType.FP32, ConfigPrecisionType.STATIC_QUANT] @@ -340,7 +389,15 @@ def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool: return False power = node.args[1] - return isinstance(power, int) and power == 2 + + if not isinstance(power, int): + why(node, reason=f"only support int powers, got {power}") + return False + + if power != 2: + why(node, reason=f"only support power == 2, got {power}") + return False + return True def supported_precision_types(self) -> List[ConfigPrecisionType]: return [ConfigPrecisionType.FP32] @@ -372,10 +429,18 @@ def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool: for dim in input_shape: if not isinstance(dim, int) or dim == 0: + why( + node, + reason=f"input tensor has invalid shape, dim: {dim} of type {type(dim)}. Expecting non-zero, int values.", + ) return False for dim in output_shape: if not isinstance(dim, int) or dim == 0: + why( + node, + reason=f"output tensor has invalid shape, dim: {dim} of type {type(dim)}. Expecting non-zero, int values.", + ) return False return True @@ -431,7 +496,14 @@ def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool: return False mask_node = node.all_input_nodes[3] mask_rank = mask_node.meta["val"].dim() - return mask_rank == 2 + if mask_rank != 2: + why( + node, + reason=f"mask must have rank 2, got mask of rank {mask_rank}", + ) + return False + + return True def get_original_aten(self) -> Optional[torch._ops.OpOverload]: return torch.ops.aten.scaled_dot_product_attention.default diff --git a/backends/xnnpack/partition/config/node_configs.py b/backends/xnnpack/partition/config/node_configs.py index 501216eaae3..2449d9d6440 100644 --- a/backends/xnnpack/partition/config/node_configs.py +++ b/backends/xnnpack/partition/config/node_configs.py @@ -4,6 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
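To make the new why() diagnostics above concrete, here is an illustrative example (not part of the patch; shapes invented) for the softmax constraint: only a softmax over the last dimension is eligible for delegation.

import torch

x = torch.randn(2, 8, 16)
torch.nn.functional.softmax(x, dim=-1)  # last dim: passes check_constraints
torch.nn.functional.softmax(x, dim=1)   # not the last dim: rejected, why() logs the reason
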
+import logging import operator from typing import List, Optional @@ -19,8 +20,12 @@ from executorch.exir.backend.canonical_partitioners.config_partitioner import ( format_target_name, ) +from executorch.exir.backend.utils import WhyNoPartition from torch.export import ExportedProgram +logger = logging.getLogger(__name__) +why = WhyNoPartition(logger=logger) + class BatchNormConfig(XNNPartitionerConfig): target_name = "_native_batch_norm_legit_no_training.default" @@ -38,9 +43,15 @@ def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool: conv_name = format_target_name(conv.target.__name__) # pyre-ignore if conv_name not in ["convolution.default"]: + why(node, f"Invalid conv target {conv_name}") + return False + + can_fuse = FuseBatchNormWithConvPass.can_fuse(conv, bn, ep) + if not can_fuse: + why(node, "BatchNorm cannot be fused with Convolution") return False - return FuseBatchNormWithConvPass.can_fuse(conv, bn, ep) + return True def get_node_and_deps( self, node: torch.fx.Node, ep: ExportedProgram @@ -74,17 +85,25 @@ def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool: supported_dtypes = {torch.float32, torch.float16, torch.int8, torch.qint8} node_val = node.meta.get("val") output_0 = node_val[0] + + input_node = node.all_input_nodes[0] + if len(input_node.meta.get("val").shape) != 4: + why(node, f"Unsupported input rank {input_node.meta.get('val').shape}") + return False # Don't check indicies dtype if output_0.dtype not in supported_dtypes: + why(node, f"Unsupported output dtype {output_0.dtype}") return False max_input = node.all_input_nodes[0] if max_input.meta.get("val").dtype not in supported_dtypes: + why(node, f"Unsupported input dtype {max_input.meta.get('val').dtype}") return False # Make sure that all users are getitems of the first output for user in node.users: if not (user.target == operator.getitem and user.args[1] == 0): + why(node, "Unsupported user of max.dim") return False return True @@ -111,7 +130,11 @@ def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool: return False weight = node.all_input_nodes[1] - return is_param_node(ep, weight) + is_param = is_param_node(ep, weight) + if not is_param: + why(node, "Prelu weight must be a parameter") + return False + return True def get_original_aten(self) -> Optional[torch._ops.OpOverload]: return torch.ops.aten.prelu.default diff --git a/backends/xnnpack/partition/config/quant_affine_configs.py b/backends/xnnpack/partition/config/quant_affine_configs.py new file mode 100644 index 00000000000..d9e789104b6 --- /dev/null +++ b/backends/xnnpack/partition/config/quant_affine_configs.py @@ -0,0 +1,65 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
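Similarly illustrative (shapes invented, not part of the patch): the max.dim constraint above requires a 4-D input, and consumers may read only the values output (getitem index 0) for the node to stay delegable.

import torch

x = torch.randn(1, 8, 4, 4)            # 4-D input: rank check passes
values, indices = torch.max(x, dim=1)  # exported as max.dim plus getitem nodes
y = values + 1                         # using only `values` keeps the node delegable;
                                       # consuming `indices` would trigger why(...)
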
+ +from typing import List, Optional + +import torch +from executorch.backends.xnnpack.partition.config.xnnpack_config import ( + ConfigPrecisionType, + XNNPartitionerConfig, +) +from torch.export import ExportedProgram + + +class QDQAffineConfigs(XNNPartitionerConfig): + def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool: + return True + + def get_node_and_deps( + self, node: torch.fx.Node, ep: ExportedProgram + ) -> List[torch.fx.Node]: + # Do not return anything from this because we only use this to + # preserve the decomposition + return [] + + def supported_precision_types(self) -> List[ConfigPrecisionType]: + return [ConfigPrecisionType.DYNAMIC_QUANT] + + +class QuantizeAffineConfig(QDQAffineConfigs): + target_name = "quantize_affine.default" + + def get_original_aten(self) -> Optional[torch._ops.OpOverload]: + try: + import torchao.quantization.quant_primitives # noqa + + return torch.ops.quant.quantize_affine.default + except: + return None + + +class DeQuantizeAffineConfig(QDQAffineConfigs): + target_name = "dequantize_affine.default" + + def get_original_aten(self) -> Optional[torch._ops.OpOverload]: + try: + import torchao.quantization.quant_primitives # noqa + + return torch.ops.quant.dequantize_affine.default + except: + return None + + +class ChooseQParamsAffineConfig(QDQAffineConfigs): + target_name = "choose_qparams_affine.default" + + def get_original_aten(self) -> Optional[torch._ops.OpOverload]: + try: + import torchao.quantization.quant_primitives # noqa + + return torch.ops.quant.choose_qparams_affine.default + except: + return None diff --git a/backends/xnnpack/partition/config/xnnpack_config.py b/backends/xnnpack/partition/config/xnnpack_config.py index 840ffbd43b4..d261416a76f 100644 --- a/backends/xnnpack/partition/config/xnnpack_config.py +++ b/backends/xnnpack/partition/config/xnnpack_config.py @@ -4,6 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +import logging from abc import abstractmethod from enum import Enum from typing import List, Optional @@ -13,8 +14,12 @@ format_target_name, PartitionerConfig, ) +from executorch.exir.backend.utils import WhyNoPartition from torch.export import ExportedProgram +logger = logging.getLogger(__name__) +why = WhyNoPartition(logger=logger) + class ConfigPrecisionType(Enum): FP32 = 1 @@ -22,7 +27,6 @@ class ConfigPrecisionType(Enum): DYNAMIC_QUANT = 3 -# TODO: add WhyNotPartition to XNNPartitionerConfig class XNNPartitionerConfig(PartitionerConfig): """ Base partitioner config for XNNPACK Partitioner Configs. 
Base wrapper class @@ -33,9 +37,11 @@ class XNNPartitionerConfig(PartitionerConfig): types they want to enable """ - def __init__(self): + def __init__(self, **kwargs): super().__init__() self.enabled_precision_types = self.supported_precision_types() + # Flag used in GEMMConfig() + self.force_fp32_dynamic_linear = kwargs.get("force_fp32_dynamic_linear", False) def get_partition( self, node: torch.fx.Node, ep: ExportedProgram @@ -125,10 +131,12 @@ def check_common_constraints( ) if len(self.enabled_precision_types) == 0: + why(node, reason="not enabled precision types") return False has_valid_dtypes = self._check_node_has_valid_dtype(node) if not has_valid_dtypes: + why(node, reason="invalid dtype") return False return True diff --git a/backends/xnnpack/partition/xnnpack_partitioner.py b/backends/xnnpack/partition/xnnpack_partitioner.py index f582ea753f4..700c7d1b753 100644 --- a/backends/xnnpack/partition/xnnpack_partitioner.py +++ b/backends/xnnpack/partition/xnnpack_partitioner.py @@ -5,6 +5,8 @@ # LICENSE file in the root directory of this source tree. import itertools + +import logging from typing import List, Optional, Type, Union from executorch.backends.xnnpack.partition.config import ALL_PARTITIONER_CONFIGS @@ -21,6 +23,9 @@ from executorch.exir.backend.partitioner import DelegationSpec from torch.fx.passes.infra.partitioner import Partition +logging.basicConfig(level=logging.WARNING) +logger = logging.getLogger(__name__) + class XnnpackPartitioner(ConfigerationBasedPartitioner): def __init__( @@ -30,7 +35,17 @@ def __init__( Union[ConfigPrecisionType, List[ConfigPrecisionType]] ] = None, per_op_mode=False, + verbose: bool = False, + **kwargs, ): + """ + @verbose: if True, print out more information about the partitioner. + Default level is WARNING. If verbose is True, level is set to DEBUG. 
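+        @kwargs: forwarded to every enabled partitioner config via
+            config(**kwargs); e.g. force_fp32_dynamic_linear=True is read by
+            GEMMConfig to keep fp32 linear weights as graph inputs rather
+            than delegate-owned constants.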
+ """ + if verbose: + logger.setLevel(logging.DEBUG) + logger.debug("Verbose logging enabled for XNNPACK partitioner.") + delegation_spec = DelegationSpec(XnnpackBackend.__name__, []) configs_to_use = configs or ALL_PARTITIONER_CONFIGS # Can do logic and have extra args to filter/delete/select @@ -41,7 +56,7 @@ def __init__( for config in configs_to_use: # Config Classes given to XnnpackPartitioner should no longer be abstract - initialized = config() # pyre-ignore + initialized = config(**kwargs) # pyre-ignore initialized.set_enabled_precision_types(config_precisions) initialized_configs.append(initialized) diff --git a/backends/xnnpack/passes/TARGETS b/backends/xnnpack/passes/TARGETS index e91614c735b..6bc3742abe6 100644 --- a/backends/xnnpack/passes/TARGETS +++ b/backends/xnnpack/passes/TARGETS @@ -30,6 +30,7 @@ python_library( "//executorch/exir:pass_base", "//executorch/exir/dialects:lib", "//executorch/exir/passes:const_prop_pass", + "//executorch/exir/passes:memory_format_ops_pass", "//executorch/exir/program:program", ], ) diff --git a/backends/xnnpack/passes/channels_last_tagged_reshape_pass.py b/backends/xnnpack/passes/channels_last_tagged_reshape_pass.py index f1f9a69acca..692f1a9d145 100644 --- a/backends/xnnpack/passes/channels_last_tagged_reshape_pass.py +++ b/backends/xnnpack/passes/channels_last_tagged_reshape_pass.py @@ -124,7 +124,7 @@ def create_call_function_node( "call_function", target=target, args=args, - kwargs=( + kwargs=( # pyre-fixme[6] {"memory_format": memory_format} if memory_format is not None else {} ), ) diff --git a/backends/xnnpack/passes/convert_to_linear.py b/backends/xnnpack/passes/convert_to_linear.py index 69f882523c8..2cef71bf927 100644 --- a/backends/xnnpack/passes/convert_to_linear.py +++ b/backends/xnnpack/passes/convert_to_linear.py @@ -13,9 +13,8 @@ from executorch.backends.transforms.addmm_mm_to_linear import ( apply_addmm_mm_to_linear_transform, ) -from executorch.backends.xnnpack.passes.xnnpack_pass import XNNPACKPass -from executorch.backends.xnnpack.utils.utils import is_param_node from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass from torch.fx.passes.infra.pass_base import PassResult from torch.fx.passes.utils.source_matcher_utils import ( @@ -27,7 +26,7 @@ logger.setLevel(logging.WARNING) -class ConvertToLinearPass(XNNPACKPass): +class ConvertToLinearPass(ExportPass): linear_modules = [ torch.nn.Linear, torch.nn.functional.linear, @@ -71,28 +70,24 @@ def get_arg(node: torch.fx.Node, arg: str): map_ = {"input": 0, "weight": 1} return None if arg == "bias" else node.args[map_[arg]] - def find_bias_for_mm(self, src_partition: SourcePartition, weight: torch.fx.Node): + def find_bias_for_mm(self, src_partition: SourcePartition, mm_node: torch.fx.Node): """ For linear decomposed with mm + add, find bias in src partition """ - out_channels = get_shape(weight)[0] - bias = None - - # Try to find bias node in all nodes - for node in src_partition.nodes: - if is_param_node(self.exported_program, node) and node != weight: - bias = node - - if bias is not None: - assert get_shape(bias) == [ - out_channels - ], f"Expected bias shape {[out_channels]} but got {get_shape(bias)}" - else: - assert exir_ops.edge.aten.add.Tensor not in [ - node.target for node in src_partition.nodes - ], f"Expecting to find bias for Linear module: {src_partition} but could not find it" - return bias + mm_users = list(mm_node.users.keys()) + if len(mm_users) != 1: + return None + + add_node = mm_users[0] + if 
add_node.target != exir_ops.edge.aten.add.Tensor: + return None + + for arg in add_node.all_input_nodes: + if arg != mm_node and arg in src_partition.input_nodes: + return arg + + return None def create_linear( self, @@ -119,7 +114,7 @@ def create_linear( src_partition.input_nodes + src_partition.params, # bias can be in params ) if linear_bias is None and node.target == exir_ops.edge.aten.mm.default: - linear_bias = self.find_bias_for_mm(src_partition, linear_weight) + linear_bias = self.find_bias_for_mm(src_partition, node) logger.debug(f"Found bias(?): {linear_bias} from node {node}") diff --git a/backends/xnnpack/passes/convert_to_sdpa.py b/backends/xnnpack/passes/convert_to_sdpa.py index 76bb24cc949..97aca5491dd 100644 --- a/backends/xnnpack/passes/convert_to_sdpa.py +++ b/backends/xnnpack/passes/convert_to_sdpa.py @@ -83,7 +83,7 @@ def create_sdpa( kwargs={"scale": scale}, ) - sdpa_node.meta["val"] = sdpa_node.target( + sdpa_node.meta["val"] = sdpa_node.target( # pyre-fixme[29] *[n.meta["val"] for n in match.placeholder_nodes], scale=scale, ) diff --git a/backends/xnnpack/passes/tag_implicit_q_dq_pass.py b/backends/xnnpack/passes/tag_implicit_q_dq_pass.py index 0aa2e1291e3..ac6ccc9b89d 100644 --- a/backends/xnnpack/passes/tag_implicit_q_dq_pass.py +++ b/backends/xnnpack/passes/tag_implicit_q_dq_pass.py @@ -12,7 +12,11 @@ SUPPORTED_IMPLICIT_Q_DQ_OP_NAMES_SET, ) from executorch.backends.xnnpack.passes.xnnpack_pass import XNNPACKPass -from executorch.backends.xnnpack.utils.quant_utils import is_dequant, is_quant +from executorch.backends.xnnpack.utils.quant_utils import ( + is_dequant, + is_dynamic_qdq, + is_quant, +) from executorch.backends.xnnpack.utils.utils import is_param_node from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import PassResult @@ -76,18 +80,7 @@ def is_output_node(self, node: torch.fx.Node) -> bool: return node.op == "output" def is_dynamically_quantized(self, node: torch.fx.Node) -> bool: - return any( - is_dequant(input_node) - and ( - cast( - torch._ops.OpOverload, input_node.target - )._schema.schema.overload_name - == "tensor" - or input_node.target - == exir_ops.edge.quantized_decomposed.dequantize_per_token.default - ) - for input_node in node.all_input_nodes - ) + return is_dynamic_qdq(node) def is_supported_quant_op(self, node: torch.fx.Node) -> bool: return ( @@ -191,7 +184,7 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult: ending_implicit_q_nodes = [] for user in first_node.users: - if self.is_dynamically_quantized(user): + if self.is_dynamically_quantized(first_node): # if the dq is a dynamic dq, then it is implicit break user_end_nodes = self.get_ending_implicit_q_nodes(user) diff --git a/backends/xnnpack/runtime/XNNCompiler.cpp b/backends/xnnpack/runtime/XNNCompiler.cpp index ac53831b04c..2145ea15199 100644 --- a/backends/xnnpack/runtime/XNNCompiler.cpp +++ b/backends/xnnpack/runtime/XNNCompiler.cpp @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include #include @@ -21,6 +21,25 @@ namespace executor { namespace xnnpack { namespace delegate { +/* + * Provide compile-time allocation. + */ +class CompileAllocator { + public: + /* + * Allocate memory which will be automatically freed at the end + * of the compilation process. 
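+   * Buffers are owned by temporaries_ and live until this allocator is
+   * destroyed (i.e. until XNNCompiler::compileModel returns); this is used
+   * below to keep fp32->bf16 converted scale buffers alive while the
+   * XNNPACK subgraph is being defined.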
+ */ + void* allocateTemporary(size_t size) { + auto mem = new uint8_t[size]; + temporaries_.emplace_back(mem); + return mem; + } + + private: + std::vector> temporaries_; +}; + // Flatbuffer types using ValuePtr = const fb_xnnpack::XValue*; using NodePtr = const fb_xnnpack::XNode*; @@ -35,6 +54,23 @@ using DefineNodeFunc = Error (*)( const std::unordered_map&, NodePtr) noexcept; +/* +Convert a tensor from fp32 to bf16. +*/ +void convertF32TensorToBF16( + const float* f32_data, + uint16_t* bf16_data_out, + size_t numel) { + for (auto i = 0u; i < numel; i++) { + // Adjust the f32 value such that it rounds properly after truncation. + // Constant factor scales 1+2^-8 to 1+2e-7. + float f32_adjusted = f32_data[i] * 1.00389105f; + uint32_t f32_bits; + memcpy(&f32_bits, &f32_adjusted, sizeof(float)); + bf16_data_out[i] = static_cast(f32_bits >> 16); + } +} + /* Gets the output min and output max for a given node operator */ @@ -152,7 +188,8 @@ Error defineTensor( GraphPtr flatbuffer_graph, const uint8_t* constant_data_ptr, std::vector& input_ids, - std::vector& output_ids) { + std::vector& output_ids, + CompileAllocator& allocator) { const fb_xnnpack::XNNTensorValue* tensor_value = nullptr; const fb_xnnpack::XNNQuantizedTensorValue* qtensor_value = nullptr; @@ -356,12 +393,31 @@ Error defineTensor( size_t group_size = qparams->group_size(); size_t output_channels = tensor_value->dims()->Get(0); size_t input_channels = tensor_value->dims()->Get(1); + + const uint16_t* scale_data = nullptr; + uint32_t scale_numel = 0; + + // Block scales are preferably serialized as bf16 but can also be + // serialized as fp32 for backwards compatability. + if (qparams->scale_bf16() != nullptr) { + scale_data = + static_cast(qparams->scale_bf16()->data()); + scale_numel = qparams->scale_bf16()->size(); + } else { + // Read fp32 scales, convert to bf16. 
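+        // The conversion buffer is allocated through the CompileAllocator so
+        // it stays alive for xnn_define_blockwise_quantized_tensor_value
+        // below. convertF32TensorToBF16 scales each value by ~1.0039 (about
+        // 1 + 2^-8, half a bf16 mantissa step) before taking the upper 16
+        // bits, approximating round-to-nearest instead of simple truncation.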
+ auto conv_buffer = static_cast(allocator.allocateTemporary( + qparams->scale()->size() * sizeof(uint16_t))); + scale_numel = qparams->scale()->size(); + convertF32TensorToBF16( + qparams->scale()->data(), conv_buffer, scale_numel); + scale_data = conv_buffer; + } + ET_CHECK_OR_RETURN_ERROR( - qparams->scale()->size() == - output_channels * input_channels / group_size, + scale_numel == output_channels * input_channels / group_size, Internal, "scale size %zu != output channels %zu * group size %zu", - (size_t)qparams->scale()->size(), + static_cast(scale_numel), output_channels, group_size); int32_t zero_point = @@ -370,18 +426,19 @@ Error defineTensor( Debug, "define quant tensor (per channel group): buffer_ptr: %p, scale.numel(): %u, channel_dim: %u, grpup_size: %zu, output_channels: %zu, dtype: %u, zero_point: %d, datatype: %d\n", buffer_ptr, - qparams->scale()->size(), + scale_numel, qparams->channel_dim(), group_size, output_channels, datatype, zero_point, datatype); + status = xnn_define_blockwise_quantized_tensor_value( /*subgraph=*/subgraph_ptr, /*datatype=*/datatype, /*zero_point=*/zero_point, - /*scale=*/qparams->scale()->data(), + /*scale=*/scale_data, /*num_dims=*/tensor_value->num_dims(), /*channel_dim=*/qparams->channel_dim(), /*block_size=*/qparams->group_size(), @@ -1617,6 +1674,7 @@ ET_NODISCARD Error XNNCompiler::compileModel( Result header = XNNHeader::Parse(buffer_pointer, num_bytes); const uint8_t* flatbuffer_data = nullptr; const uint8_t* constant_data = nullptr; + CompileAllocator compile_allocator; // Header status can only either be Error::Ok or Error::NotFound if (header.ok()) { @@ -1688,7 +1746,8 @@ ET_NODISCARD Error XNNCompiler::compileModel( flatbuffer_graph, constant_data, input_ids, - output_ids); + output_ids, + compile_allocator); if (err != Error::Ok) { return err; diff --git a/backends/xnnpack/runtime/XNNPACKBackend.cpp b/backends/xnnpack/runtime/XNNPACKBackend.cpp index 264dc838720..c817c010e29 100644 --- a/backends/xnnpack/runtime/XNNPACKBackend.cpp +++ b/backends/xnnpack/runtime/XNNPACKBackend.cpp @@ -20,7 +20,7 @@ namespace torch { namespace executor { -class XnnpackBackend final : public PyTorchBackendInterface { +class XnnpackBackend final : public ::executorch::runtime::BackendInterface { public: ~XnnpackBackend() = default; diff --git a/backends/xnnpack/serialization/runtime_schema.fbs b/backends/xnnpack/serialization/runtime_schema.fbs index f32e7c60637..efe717e085e 100644 --- a/backends/xnnpack/serialization/runtime_schema.fbs +++ b/backends/xnnpack/serialization/runtime_schema.fbs @@ -63,6 +63,7 @@ table PerChannelGroupQuant { scale:[float]; channel_dim:int; group_size:int; + scale_bf16:[ushort]; } table XNNTensorValue { diff --git a/backends/xnnpack/serialization/schema.fbs b/backends/xnnpack/serialization/schema.fbs index 773a459bbf6..33571195d63 100644 --- a/backends/xnnpack/serialization/schema.fbs +++ b/backends/xnnpack/serialization/schema.fbs @@ -48,6 +48,7 @@ table PerChannelGroupQuant { scale:[float]; channel_dim:int; group_size:int; + scale_bf16:[ushort]; } table PerChannelQuant { diff --git a/backends/xnnpack/targets.bzl b/backends/xnnpack/targets.bzl index 4fd0ee519cb..633808dcfe5 100644 --- a/backends/xnnpack/targets.bzl +++ b/backends/xnnpack/targets.bzl @@ -36,10 +36,10 @@ def define_common_targets(): "@EXECUTORCH_CLIENTS", ], preprocessor_flags = [ + # Enable workspace sharing across delegates + "-DENABLE_XNNPACK_SHARED_WORKSPACE", # Uncomment to enable per operator timings # "-DENABLE_XNNPACK_PROFILING", - # Uncomment to 
enable workspace sharing across delegates - # "-DENABLE_XNNPACK_SHARED_WORKSPACE" ], exported_deps = [ "//executorch/runtime/backend:interface", @@ -47,7 +47,7 @@ def define_common_targets(): deps = [ third_party_dep("XNNPACK"), "//executorch/backends/xnnpack/serialization:xnnpack_flatbuffer_header", - "//executorch/backends/xnnpack/threadpool:threadpool", + "//executorch/extension/threadpool:threadpool", "//executorch/runtime/core/exec_aten/util:tensor_util", ], # XnnpackBackend.cpp needs to compile with executor as whole diff --git a/backends/xnnpack/test/CMakeLists.txt b/backends/xnnpack/test/CMakeLists.txt index d0fbddae237..02852871fe0 100644 --- a/backends/xnnpack/test/CMakeLists.txt +++ b/backends/xnnpack/test/CMakeLists.txt @@ -23,8 +23,10 @@ include(${EXECUTORCH_ROOT}/build/Test.cmake) set(_test_srcs # We can't put runtime/test_runtime_utils.cpp because we don't # build aten - runtime/test_xnnexecutor.cpp ../threadpool/threadpool.cpp - ../threadpool/threadpool_guard.cpp ../threadpool/test/threadpool_test.cpp + runtime/test_xnnexecutor.cpp + ${EXECUTORCH_ROOT}/extension/threadpool/threadpool.cpp + ${EXECUTORCH_ROOT}/extension/threadpool/threadpool_guard.cpp + ${EXECUTORCH_ROOT}/extension/threadpool/test/threadpool_test.cpp ) et_cxx_test( @@ -32,6 +34,7 @@ et_cxx_test( SOURCES ${_test_srcs} EXTRA_LIBS + extension_threadpool xnnpack_backend XNNPACK pthreadpool diff --git a/backends/xnnpack/test/TARGETS b/backends/xnnpack/test/TARGETS index abedffb8e61..629ac8275bc 100644 --- a/backends/xnnpack/test/TARGETS +++ b/backends/xnnpack/test/TARGETS @@ -36,10 +36,10 @@ runtime.python_test( deps = [ "//executorch/backends/xnnpack/partition:xnnpack_partitioner", "//executorch/backends/xnnpack/test/tester:tester", + "//executorch/devtools:lib", + "//executorch/devtools/bundled_program:config", + "//executorch/devtools/bundled_program/serialize:lib", "//executorch/exir/passes:constant_prop_pass", - "//executorch/sdk:lib", - "//executorch/sdk/bundled_program:config", - "//executorch/sdk/bundled_program/serialize:lib", "//pytorch/ao:torchao", # @manual ], external_deps = [ diff --git a/backends/xnnpack/test/ops/linear.py b/backends/xnnpack/test/ops/linear.py index a9459050e79..d8de79f283d 100644 --- a/backends/xnnpack/test/ops/linear.py +++ b/backends/xnnpack/test/ops/linear.py @@ -26,8 +26,167 @@ ) from torch.ao.quantization.quantizer.xnnpack_quantizer_utils import QuantizationConfig +try: + from torchao.quantization.quant_api import ( + int8_dynamic_activation_int4_weight, + quantize_, + unwrap_tensor_subclass, + ) + + torchao_installed = True +except: + torchao_installed = False + + +# Pytorch Modules Used for Testing +class BaseLinear(torch.nn.Module): + def __init__( + self, + in_size: int = 2, + input_channels: int = 4, + output_channels: int = 4, + dtype: torch.dtype = torch.float, + use_bias: bool = False, + ): + super().__init__() + self.linear = torch.nn.Linear( + input_channels, output_channels, bias=use_bias + ).to(dtype=dtype) + + self.ic = input_channels + self.oc = output_channels + + assert dtype in [torch.float, torch.half], "Unsupported op dtype" + self.op_dtype = dtype + self.in_size = in_size + + def forward(self, x): + return self.linear(x) + + def get_inputs(self): + return (torch.randn(1, self.in_size, self.ic).to(self.op_dtype),) + + +class AddMMModule(torch.nn.Module): + def __init__(self, in_size, out_size): + super().__init__() + self.mat = torch.nn.Parameter(torch.randn(in_size, out_size)) + self.bias = torch.nn.Parameter(torch.randn(1, out_size)) + + def forward(self, 
x): + return torch.addmm(self.bias, x, self.mat) + + +class LinearReluModule(torch.nn.Module): + def __init__(self, in_size, out_size, use_bias, dtype=torch.float): + super().__init__() + self.dtype = dtype + self.linear = torch.nn.Linear(in_size, out_size, bias=use_bias).to(dtype=dtype) + + def forward(self, x): + return torch.nn.functional.relu(self.linear(x)) + + def get_inputs(self): + return (torch.randn(1, self.in_size, self.ic).to(self.op_dtype),) + + +class LinearParallelSequentialModule(torch.nn.Module): + def __init__( + self, + in_size=2, + input_size=4, + intermediate_size=5, + output_size=3, + dtype=torch.float, + ): + super().__init__() + self.linear1_weight = torch.nn.Parameter( + torch.rand(intermediate_size, input_size) + ) + self.linear1_bias = torch.nn.Parameter(torch.rand(intermediate_size)) + + self.linear2_weight = torch.nn.Parameter( + torch.rand(intermediate_size, input_size) + ) + self.linear2_bias = torch.nn.Parameter(torch.rand(intermediate_size)) + + self.linear3_weight = torch.nn.Parameter( + torch.rand(output_size, intermediate_size) + ) + self.linear3_bias = torch.nn.Parameter(torch.rand(output_size)) + self.in_size = in_size + self.input_size = input_size + self.dtype = torch.float + + def forward(self, x, y): + a = torch.nn.functional.linear(x, self.linear1_weight, self.linear1_bias) + b = torch.nn.functional.linear(y, self.linear2_weight, self.linear2_bias) + c = torch.nn.functional.linear(b, self.linear3_weight, self.linear3_bias) + return (a, c) + + def get_inputs(self): + return ( + torch.rand(self.in_size, self.input_size, dtype=self.dtype), + torch.rand(self.in_size, self.input_size, dtype=self.dtype), + ) + + +class LinearSequential(torch.nn.Module): + def __init__( + self, + in_size=2, + input_size=4, + intermediate_size=5, + output_size=3, + dtype=torch.float, + ): + super().__init__() + self.linear1_weight = torch.nn.Parameter( + torch.rand(intermediate_size, input_size) + ) + self.linear1_bias = torch.nn.Parameter(torch.rand(intermediate_size)) + + self.linear2_weight = torch.nn.Parameter( + torch.rand(output_size, intermediate_size) + ) + self.linear2_bias = torch.nn.Parameter(torch.rand(output_size)) + self.in_size = in_size + self.input_size = input_size + self.dtype = torch.float + + def forward(self, x): + a = torch.nn.functional.linear(x, self.linear1_weight, self.linear1_bias) + b = torch.nn.functional.linear(a, self.linear2_weight, self.linear2_bias) + return b + + def get_inputs(self): + return (torch.rand(self.in_size, self.input_size, dtype=torch.float),) + class TestLinear(unittest.TestCase): + """ + Test Class for XNNPACK Linear Operators. + + Notes: + - XNNPACK Does not support Per Tensor Quantized Weights with Dynamic Activations + - XNNPACK Only supports Per-Token Activation, so Dynamic per-tensor Quantization + As done by the default dynamic quantization flow does Per-Token Quantization + Activation under the hood, where the torch.nn.Module is doing Per-Tensor Quantization + on the Activation. This is sufficient because Per-Token Quantization on Activations + should produce strictly better results compared to Per-Tensor Quantization + """ + + @staticmethod + def _get_4b_dqconfig() -> QuantizationConfig: + # Returns a QuantizationConfig for 4b dynamic quantization for XNNPACK. 
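+        # weight_qmin/weight_qmax of -8/7 give the signed 4-bit weight range;
+        # is_per_channel + is_dynamic select per-channel weights with
+        # dynamically quantized activations (per-token under the hood, per
+        # the class docstring above).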
+ qconfig: QuantizationConfig = get_symmetric_quantization_config( + is_per_channel=True, + is_dynamic=True, + weight_qmin=-8, + weight_qmax=7, + ) + return qconfig + def test_fp16_linear(self): for use_bias in (True, False): for num_batch_dims in range(1, 3): @@ -65,33 +224,13 @@ def test_qc8_linear(self): ) def test_fp32_addmm(self): - """ - Note that the ConvertToLinear pass requires the weight matrix to be transposed. - """ - - class AddMMModule(torch.nn.Module): - def __init__(self, in_size, out_size): - super().__init__() - self.mat = torch.nn.Parameter(torch.randn(in_size, out_size)) - self.bias = torch.nn.Parameter(torch.randn(1, out_size)) - - def forward(self, x): - return torch.addmm(self.bias, x, self.mat) - + # Note that the ConvertToLinear pass requires the weight matrix to be transposed. self._test_linear( lambda in_size, out_size: AddMMModule(in_size, out_size), uses_bias=True, ) def test_fp32_linear_fused_relu(self): - class LinearReluModule(torch.nn.Module): - def __init__(self, in_size, out_size, use_bias): - super().__init__() - self.linear = torch.nn.Linear(in_size, out_size, bias=use_bias) - - def forward(self, x): - return torch.nn.functional.relu(self.linear(x)) - for use_bias in (True, False): for num_batch_dims in range(1, 3): self._test_linear( @@ -105,14 +244,6 @@ def forward(self, x): ) def test_qs8_linear_fused_relu(self): - class LinearReluModule(torch.nn.Module): - def __init__(self, in_size, out_size, use_bias): - super().__init__() - self.linear = torch.nn.Linear(in_size, out_size, bias=use_bias) - - def forward(self, x): - return torch.nn.functional.relu(self.linear(x)) - for use_bias in (True, False): for num_batch_dims in range(1, 3): self._test_linear( @@ -138,21 +269,6 @@ def test_qs8_linear(self): quant_type="per_tensor", ) - @unittest.skip("XNNPACK currently only supports per-channel dynamic quantization.") - def _test_qd8_per_tensor_linear(self): - for uses_bias in (False, True): - inputs = (torch.randn(2, 4),) - module = torch.nn.Linear(4, 5, bias=uses_bias) - dynamic_shapes = ({0: torch.export.Dim("batch", max=100)},) - - self._test_dqlinear( - module, - inputs, - dynamic_shapes=dynamic_shapes, - is_per_channel=False, - uses_bias=uses_bias, - ) - def test_qd8_per_channel_linear(self): for uses_bias in (False, True): inputs = (torch.randn(2, 4),) @@ -166,19 +282,6 @@ def test_qd8_per_channel_linear(self): uses_bias=uses_bias, ) - @staticmethod - def _get_4b_dqconfig() -> QuantizationConfig: - """ - Returns a QuantizationConfig for 4b dynamic quantization for XNNPACK. 
- """ - qconfig: QuantizationConfig = get_symmetric_quantization_config( - is_per_channel=True, - is_dynamic=True, - weight_qmin=-8, - weight_qmax=7, - ) - return qconfig - def test_qd8_per_channel_4w_linear(self): qconfig = self._get_4b_dqconfig() input_channels = [2, 63] @@ -267,38 +370,12 @@ def test_qd8_per_channel_linear_with_two_batch(self): ) def test_qd8_per_channel_linear_sequential(self): - in_size = 2 - input_size = 4 - intermediate_size = 5 - output_size = 3 - - class LinearSequential(torch.nn.Module): - def __init__(self): - super().__init__() - self.linear1_weight = torch.nn.Parameter( - torch.rand(intermediate_size, input_size) - ) - self.linear1_bias = torch.nn.Parameter(torch.rand(intermediate_size)) - - self.linear2_weight = torch.nn.Parameter( - torch.rand(output_size, intermediate_size) - ) - self.linear2_bias = torch.nn.Parameter(torch.rand(output_size)) - - def forward(self, x): - a = torch.nn.functional.linear( - x, self.linear1_weight, self.linear1_bias - ) - b = torch.nn.functional.linear( - a, self.linear2_weight, self.linear2_bias - ) - return b - - inputs = (torch.rand(in_size, input_size, dtype=torch.float),) + lin_mod = LinearSequential() + inputs = lin_mod.get_inputs() dynamic_shapes = ({0: torch.export.Dim("batch", max=100)},) self._test_dqlinear( - LinearSequential(), + lin_mod, inputs, dynamic_shapes=dynamic_shapes, linear_count=2, @@ -307,53 +384,16 @@ def forward(self, x): atol=1e-1, ) - def test_qd8_per_channel_linear_parellel_and_sequential(self): - in_size = 2 - input_size = 4 - intermediate_size = 5 - output_size = 3 - - class LinearModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.linear1_weight = torch.nn.Parameter( - torch.rand(intermediate_size, input_size) - ) - self.linear1_bias = torch.nn.Parameter(torch.rand(intermediate_size)) - - self.linear2_weight = torch.nn.Parameter( - torch.rand(intermediate_size, input_size) - ) - self.linear2_bias = torch.nn.Parameter(torch.rand(intermediate_size)) - - self.linear3_weight = torch.nn.Parameter( - torch.rand(output_size, intermediate_size) - ) - self.linear3_bias = torch.nn.Parameter(torch.rand(output_size)) - - def forward(self, x, y): - a = torch.nn.functional.linear( - x, self.linear1_weight, self.linear1_bias - ) - b = torch.nn.functional.linear( - y, self.linear2_weight, self.linear2_bias - ) - c = torch.nn.functional.linear( - b, self.linear3_weight, self.linear3_bias - ) - return (a, c) - - inputs = ( - torch.rand(in_size, input_size, dtype=torch.float), - torch.rand(in_size, input_size, dtype=torch.float), - ) + def test_qd8_per_channel_linear_parallel_and_sequential(self): + lin_mod = LinearParallelSequentialModule() + inputs = lin_mod.get_inputs() dynamic_shapes = ( {0: torch.export.Dim("batch", max=100)}, {0: torch.export.Dim("batch2", max=100)}, ) self._test_dqlinear( - LinearModule(), + lin_mod, inputs, dynamic_shapes=dynamic_shapes, linear_count=3, @@ -362,90 +402,59 @@ def forward(self, x, y): atol=1e-1, ) - def test_qd8_fp32_per_token_weight_per_channel_int8(self): - self._run_manual_dqlinear_tests(8, torch.float) - - def test_qd8_fp32_per_token_weight_per_channel_int4(self): - self._run_manual_dqlinear_tests(4, torch.float) - - # This fails because the output tensor dtype is different, but if you squint and ignore that and look at the values, - # it is not too bad. - # Difference: max: 0.042601585388183594, abs: 0.042601585388183594. - # -- Model vs. 
Reference -- - # Numel: 68, 68 - # Median: -0.7754800915718079, -0.7755751013755798 - # Mean: -0.6128872036933899, -0.6143574714660645 - # Max: 12.518657684326172, 12.516003608703613 - # Min: -20.070953369140625, -20.077701568603516 - @unittest.skip("Need to fix the dq_per_channel output dtype") - def _test_qd8_fp16_per_token_weight_per_channel_int8(self): - self._run_manual_dqlinear_tests(8, torch.float16) - - @unittest.skip("Need to fix the dq_per_channel output dtype") - def _test_qd8_fp16_per_token_weight_per_channel_int4(self): - self._run_manual_dqlinear_tests(4, torch.float16) - + @unittest.skipIf( + not torchao_installed, "Per Channel Group Quantization Required TorchAO" + ) def test_qd8_fp32_per_token_weight_per_channel_group_int4(self): M_sizes = [1, 2, 17, 31] - K_sizes = [8, 32, 64, 128] - bl_sizes = [8, 16, 16, 32] + K_sizes = [32, 32, 64, 128] + bl_sizes = [32, 32, 32, 64] N_sizes = [2, 17, 92, 128] for use_bias in [True, False]: - for i, _ in enumerate(M_sizes): - M = int(M_sizes[i]) - K = int(K_sizes[i]) - N = int(N_sizes[i]) - bl = int(bl_sizes[i]) - mod = self.ManualDQLinear( + for M, K, bl, N in zip(M_sizes, K_sizes, bl_sizes, N_sizes): + lin_mod = BaseLinear( input_channels=K, output_channels=N, - weight_n_bit=4, dtype=torch.float, - group_size=bl, - force_groupwise_quant=True, use_bias=use_bias, ) inputs = (torch.randn(1, M, K),) - self._test_manual_dq_linear( - mod, - inputs, - weight_groupwise=True, - use_bias=use_bias, + self._test_groupwise_dq_linear( + lin_mod, inputs, group_size=bl, use_bias=use_bias ) - @unittest.skip("Need to fix the dq_per_channel_group output dtype") - def _test_qd8_fp16_per_token_weight_per_channel_group_int4(self): + @unittest.skipIf( + not torchao_installed, "Per Channel Group Quantization Required TorchAO" + ) + def test_qd8_fp16_per_token_weight_per_channel_group_int4(self): M_sizes = [1, 2, 17, 31] - K_sizes = [8, 32, 64, 128] - bl_sizes = [8, 16, 16, 32] + K_sizes = [32, 32, 64, 128] + bl_sizes = [32, 32, 32, 64] N_sizes = [2, 17, 92, 128] for use_bias in [True, False]: - for i, _ in enumerate(M_sizes): - M = int(M_sizes[i]) - K = int(K_sizes[i]) - N = int(N_sizes[i]) - bl = int(bl_sizes[i]) - mod = self.ManualDQLinear( + for M, K, bl, N in zip(M_sizes, K_sizes, bl_sizes, N_sizes): + lin_mod = BaseLinear( + in_size=M, input_channels=K, output_channels=N, - weight_n_bit=4, dtype=torch.float16, - group_size=bl, - force_groupwise_quant=True, use_bias=use_bias, ) - inputs = (torch.randn(1, M, K, dtype=torch.float16),) - self._test_manual_dq_linear( - mod, - inputs, - weight_groupwise=True, - use_bias=use_bias, - atol=0.1, - rtol=0.1, + inputs = lin_mod.get_inputs() + # This requires slightly higher atol, but if you look at error it is not that bad: + # Difference: max: 0.00140380859375, abs: 0.00140380859375, mean abs error: 0.00042724609375. + # -- Model vs. 
Reference -- + # Numel: 4, 4 + # Median: -0.05023193359375, -0.0516357421875 + # Mean: 0.2373046875, 0.237060546875 + # Max: 1.0078125, 1.0078125 + # Min: -0.08465576171875, -0.08441162109375 + self._test_groupwise_dq_linear( + lin_mod, inputs, group_size=bl, use_bias=use_bias, atol=1e-2 ) def _test_linear( @@ -467,7 +476,20 @@ def _test_linear( input_sizes = [4, 37, 17] output_sizes = [4, 17, 37] - quant = quant_type is not None + quant_config = None + if quant_type is not None: + if quant_type == "per_channel": + quant_config = get_symmetric_quantization_config( + is_per_channel=True, + is_dynamic=False, + ) + elif quant_type == "per_tensor": + quant_config = get_symmetric_quantization_config( + is_per_channel=False, + is_dynamic=False, + ) + else: + raise ValueError(f"Unsupported quant type {quant_type}") """ Note that torch.nn.Linear maps to aten.mm.default (no bias) or aten.addmm.default (bias), @@ -478,7 +500,6 @@ def _test_linear( input_size = int(input_sizes[i]) output_size = int(output_sizes[i]) input_shape = [in_size] * num_batch_dims + [input_size] - print(f"Testing input_shape {input_shape} with {output_size} out_channels") module = make_module(input_size, output_size).eval().to(dtype) inputs = (torch.randn(input_shape).to(dtype),) @@ -487,28 +508,15 @@ def _test_linear( dynamic_shape[i] = torch.export.Dim(f"batch{i}", min=2, max=in_size) dynamic_shape = (dynamic_shape,) - print(dynamic_shape) for legacy_mode in (True, False): tester = Tester(module, inputs, dynamic_shapes=dynamic_shape) - if quant: - if quant_type == "per_channel": - quant_config = get_symmetric_quantization_config( - is_per_channel=True, - is_dynamic=False, - ) - elif quant_type == "per_tensor": - quant_config = get_symmetric_quantization_config( - is_per_channel=False, - is_dynamic=False, - ) - else: - raise ValueError(f"Unsupported quant type {quant_type}") + if quant_config: tester.quantize(Quantize(quantization_config=quant_config)) tester.export() - if quant: + if quant_config: tester.check(["torch.ops.quantized_decomposed"]) if legacy_mode: @@ -522,12 +530,19 @@ def _test_linear( ) tester.check_not([edge_op]) - if quant: - tester.check_not([edge_op, "torch.ops.quantized_decomposed"]) + if quant_config: + tester.check_not( + [ + "executorch_exir_dialects_edge__ops_aten_mm_default", + "executorch_exir_dialects_edge__ops_aten_addmm_default", + ] + ) tester.to_executorch() tester.serialize() - tester.run_method_and_compare_outputs(qtol=quant, atol=atol) + tester.run_method_and_compare_outputs( + qtol=bool(quant_config), atol=atol + ) def _test_dqlinear( self, @@ -540,24 +555,19 @@ def _test_dqlinear( qconfig: Optional[QuantizationConfig] = None, atol=5e-02, ): - edge_op = ( - "executorch_exir_dialects_edge__ops_aten_addmm_default" - if uses_bias - else "executorch_exir_dialects_edge__ops_aten_mm_default" - ) - quant_config = qconfig or get_symmetric_quantization_config( is_per_channel=is_per_channel, is_dynamic=True, ) for legacy_partitioner in (True, False): for per_op_mode in (True, False): - tester = Tester(module, inputs, dynamic_shapes=dynamic_shapes) - tester.quantize(Quantize(quantization_config=quant_config)) DynamicallyQuantizedPartitioner = XnnpackPartitioner( config_precisions=ConfigPrecisionType.DYNAMIC_QUANT, per_op_mode=per_op_mode, ) + + tester = Tester(module, inputs, dynamic_shapes=dynamic_shapes) + tester.quantize(Quantize(quantization_config=quant_config)) tester.export() if legacy_partitioner: @@ -567,357 +577,74 @@ def _test_dqlinear( tester.to_edge_transform_and_lower( 
ToEdgeTransformAndLower([DynamicallyQuantizedPartitioner]) ) - num_call_delegates = linear_count if per_op_mode else 1 tester.check_count( { - "torch.ops.higher_order.executorch_call_delegate": num_call_delegates + "torch.ops.higher_order.executorch_call_delegate": ( + linear_count if per_op_mode else 1 + ) } ) - tester.check_not([edge_op]) + tester.check_not( + [ + "executorch_exir_dialects_edge__ops_aten_mm_default", + "executorch_exir_dialects_edge__ops_aten_addmm_default", + ] + ) tester.to_executorch() tester.serialize() tester.run_method_and_compare_outputs(atol=atol) - class ManualDQLinear(torch.nn.Module): - def __init__( - self, - input_channels: int = 4, - output_channels: int = 4, - dtype: torch.dtype = torch.float, - weight_n_bit: int = 4, - group_size: int = 0, - force_groupwise_quant: bool = False, - use_bias: bool = False, - ): - super().__init__() - - self.ic = input_channels - self.oc = output_channels - - assert dtype in [torch.float, torch.half], "Unsupported op dtype" - self.op_dtype = dtype - - self.group_size = self.ic if group_size == 0 else group_size - self.num_groups = 1 - if self.group_size != self.ic: - assert self.ic % self.group_size == 0 - assert self.group_size % 8 == 0 # TODO make this 16 - self.num_groups = self.ic // self.group_size - - assert weight_n_bit in [4, 8], "Unsupported weight_n_bit" - self.w_n_bit = weight_n_bit - self.w_quant_min, self.w_quant_max = self.get_min_max(self.w_n_bit) - - self.w = torch.nn.Parameter( - torch.randn(self.oc, self.ic), requires_grad=False - ) - self.w_q = torch.nn.Parameter( - torch.zeros(self.oc, self.ic), requires_grad=False - ) - # Quantize the weights as per folded setup - if self.group_size != self.ic or force_groupwise_quant: - self.w_scales = torch.nn.Parameter( - torch.zeros(self.oc, self.num_groups), requires_grad=False - ) - self.w_zero_points = torch.nn.Parameter( - torch.zeros(self.oc, self.num_groups), requires_grad=False - ) - self.quant_weight_per_channel_group() - else: # per_channel quantization - self.w_scales = torch.nn.Parameter( - torch.zeros(self.oc), requires_grad=False - ) - self.w_zero_points = torch.nn.Parameter( - torch.zeros(self.oc), requires_grad=False - ) - self.quant_weight_per_channel() - - self.bias = ( - torch.nn.Parameter( - torch.randn(self.oc).to(self.op_dtype), requires_grad=False - ) - if use_bias - else None - ) - - def get_min_max(self, n_bit: int = 4): - max_int = 2 ** (n_bit - 1) - 1 - min_int = -(2 ** (n_bit - 1)) - return min_int, max_int - - def get_channel_qparams_symmetric( - self, - w: torch.Tensor, - n_bit: int = 4, - precision: torch.dtype = torch.float32, - ): - assert w.dim() == 2 - - to_quant = w.to(precision) - assert torch.isnan(to_quant).sum() == 0 - - max_val = to_quant.amax(dim=1, keepdim=True) - min_val = to_quant.amin(dim=1, keepdim=True) - min_val_neg = torch.min(min_val, torch.zeros_like(min_val)) - max_val_pos = torch.max(max_val, torch.zeros_like(max_val)) - - min_int, max_int = self.get_min_max(n_bit) - - max_val_abs = torch.max(-min_val_neg, max_val_pos) - scales = max_val_abs / (float(max_int - min_int) / 2) - scales = torch.max( - scales, torch.full_like(scales, torch.finfo(torch.float32).eps) - ) - zeros = torch.full_like(scales, 0) - return scales.to(precision).reshape(w.shape[0]), zeros.to( - precision - ).reshape(w.shape[0]).reshape(w.shape[0]) - - # Note: not using from torchao.quantization.quant_primitives because it will run into op registraion issues - def get_group_qparams_symmetric( - self, w, n_bit=4, groupsize=128, 
precision=torch.float32 - ): - # needed for GPTQ with padding - if groupsize > w.shape[-1]: - groupsize = w.shape[-1] - assert groupsize > 1 - assert w.shape[-1] % groupsize == 0 - assert w.dim() == 2 - - to_quant = w.reshape(-1, groupsize) - assert torch.isnan(to_quant).sum() == 0 - - max_val = to_quant.amax(dim=1, keepdim=True) - min_val = to_quant.amin(dim=1, keepdim=True) - min_val_neg = torch.min(min_val, torch.zeros_like(min_val)) - max_val_pos = torch.max(max_val, torch.zeros_like(max_val)) - - max_val_abs = torch.max(-min_val_neg, max_val_pos) - max_int = 2 ** (n_bit - 1) - 1 - min_int = -(2 ** (n_bit - 1)) - - scales = max_val_abs / (float(max_int - min_int) / 2) - scales = torch.max( - scales, torch.full_like(scales, torch.finfo(torch.float32).eps) - ) - # TODO: make sure abs(scales) is not too small? - zeros = torch.full_like(scales, 0) - return scales.to(precision).reshape(w.shape[0], -1), zeros.to( - precision - ).reshape(w.shape[0], -1) - - # Note: not using from torchao.quantization.quant_primitives because it will run into op registraion issues - def group_quantize_tensor_symmetric( - self, w, n_bit=4, group_size=128, precision=torch.float32 - ): - scales, zeros = self.get_group_qparams_symmetric( - w, n_bit, group_size, precision - ) - n_bit = 4 - max_int = 2 ** (n_bit - 1) - 1 - min_int = -(2 ** (n_bit - 1)) - # TODO: currently we don't know how to express torch.int4, we'll - # add torch.int4 to core later - w_int8 = torch.ops.quantized_decomposed.quantize_per_channel_group( - w, scales, zeros, min_int, max_int, torch.int8, group_size - ) - - return w_int8, scales, zeros - - def fwd_input_per_token(self, input: torch.Tensor) -> torch.Tensor: - ip_quant_min = -128 - ip_quant_max = 127 - ( - ip_scales, - ip_zero_points, - ) = torch.ops.quantized_decomposed.choose_qparams_per_token_asymmetric( - input, torch.int8 - ) - - input = torch.ops.quantized_decomposed.quantize_per_token( - input, - ip_scales, - ip_zero_points, - ip_quant_min, - ip_quant_max, - torch.int8, - ) - input = torch.ops.quantized_decomposed.dequantize_per_token( - input, - ip_scales, - ip_zero_points, - ip_quant_min, - ip_quant_max, - torch.int8, - self.op_dtype, - ) - return input - - def quant_weight_per_channel(self): - ( - self.w_scales.data, - self.w_zero_points.data, - ) = self.get_channel_qparams_symmetric( - self.w, n_bit=self.w_n_bit, precision=self.op_dtype - ) - self.w_q.data = torch.ops.quantized_decomposed.quantize_per_channel( - self.w, - self.w_scales, - self.w_zero_points, - axis=0, - quant_min=self.w_quant_min, - quant_max=self.w_quant_max, - dtype=torch.int8, - ) - - def quant_weight_per_channel_group(self): - self.w_q.data, w, zp = self.group_quantize_tensor_symmetric( - self.w, - n_bit=self.w_n_bit, - group_size=self.group_size, - ) - expected_min, expected_max = self.get_min_max(self.w_n_bit) - assert ( - torch.min(self.w_q.data) >= expected_min - ), "Found smaller than min element in quantized weight tensor" - assert ( - torch.max(self.w_q.data) <= expected_max - ), "Found larger than max element in quantized weight tensor" - assert ( - w.ndim == 2 and zp.ndim == 2 - ), f"Expecting 2d scales and zp tensors, but got {w.shape}, {zp.shape}" - self.w_scales.data, self.w_zero_points.data = w, zp - - def fwd_weight_per_channel(self) -> torch.Tensor: - # This is HACKY because the dequant will produce fp32 - return torch.ops.quantized_decomposed.dequantize_per_channel( - self.w_q, - self.w_scales, - self.w_zero_points, - axis=0, - quant_min=self.w_quant_min, - quant_max=self.w_quant_max, - 
dtype=torch.int8, # Regardless of w_n_bit, convert to 4b later - ) - - def fwd_weight_per_channel_group(self) -> torch.Tensor: - return torch.ops.quantized_decomposed.dequantize_per_channel_group( - self.w_q, - self.w_scales, - self.w_zero_points, - self.w_quant_min, - self.w_quant_max, - dtype=torch.int8, # Regardless of w_n_bit, convert to 4b later - group_size=self.group_size, - output_dtype=self.op_dtype, - ) - - def forward(self, input: torch.Tensor) -> torch.Tensor: - # Input - input = self.fwd_input_per_token(input) - - # Weights - w = ( - self.fwd_weight_per_channel_group() - if self.w_scales.ndim == 2 - else self.fwd_weight_per_channel() - ) - assert isinstance(w, torch.Tensor) - return torch.nn.functional.linear(input, w, self.bias) - - def _test_manual_dq_linear( + def _test_groupwise_dq_linear( self, mod: torch.nn.Module, inputs: Tuple[torch.Tensor], - weight_groupwise: bool = False, use_bias: bool = False, - atol: float = 1e-3, - rtol: float = 1e-3, + group_size: int = 8, + num_linears: int = 1, + atol: float = 5e-3, + rtol: float = 5e-3, ): - linear_edge_op = ( - "executorch_exir_dialects_edge__ops_aten_addmm_default" - if use_bias - else "executorch_exir_dialects_edge__ops_aten_mm_default" + quantize_(mod, int8_dynamic_activation_int4_weight(group_size=group_size)) + unwrap_tensor_subclass(mod) + DynamicallyQuantizedPartitioner = XnnpackPartitioner( + config_precisions=ConfigPrecisionType.DYNAMIC_QUANT, + per_op_mode=True, ) - - weight_dq_edge_op = ( - "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_channel_group_default" - if weight_groupwise - else "torch.ops.quantized_decomposed.dequantize_per_channel.default" - ) - - weight_dq_aten_op = ( - "torch.ops.quantized_decomposed.dequantize_per_channel_group.default" - if weight_groupwise - else "torch.ops.quantized_decomposed.dequantize_per_channel.default" + tester = ( + Tester(mod, inputs) + .export() + .check_count( + { + "torch.ops.quant.choose_qparams_affine.default": 1 * num_linears, + "torch.ops.quant.quantize_affine.default": 1 * num_linears, + "torch.ops.quant.dequantize_affine.default": 2 * num_linears, + "torch.ops.aten.linear.default": 1 * num_linears, + } + ) ) - for legacy_partitioner in (True, False): - tester = ( - Tester(mod, inputs) - .export() - .check_count( - { - "torch.ops.quantized_decomposed.choose_qparams_per_token_asymmetric.default": 1, - "torch.ops.quantized_decomposed.quantize_per_token.default": 1, - "torch.ops.quantized_decomposed.dequantize_per_token.default": 1, - weight_dq_aten_op: 1, - "torch.ops.aten.linear.default": 1, - } - ) + ( + tester.to_edge_transform_and_lower( + ToEdgeTransformAndLower([DynamicallyQuantizedPartitioner]) ) + ) - DynamicallyQuantizedPartitioner = XnnpackPartitioner( - config_precisions=ConfigPrecisionType.DYNAMIC_QUANT, - per_op_mode=True, + ( + tester.check_count( + { + "torch.ops.higher_order.executorch_call_delegate": 1, + } ) - if legacy_partitioner: - tester.to_edge() - tester.partition(Partition(DynamicallyQuantizedPartitioner)) - else: - ( - tester.to_edge_transform_and_lower( - ToEdgeTransformAndLower([DynamicallyQuantizedPartitioner]) - ) - ) - - ( - tester.check_count( - { - "torch.ops.higher_order.executorch_call_delegate": 1, - } - ) - .check_not( - [ - "executorch_exir_dialects_edge__ops_quantized_decomposed_choose_qparams_per_token_asymmetric_default", - "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_token_default", - "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_token_default", - 
weight_dq_edge_op, - linear_edge_op, - ] - ) - .to_executorch() - .serialize() - .run_method_and_compare_outputs(atol=atol, rtol=rtol) + .check_not( + [ + "executorch_exir_dialects_edge__ops_quant_choose_qparams_affine_default", + "executorch_exir_dialects_edge__ops_quant_quantize_affine_default", + "executorch_exir_dialects_edge__ops_quant_dequantize_affine_default", + "executorch_exir_dialects_edge__ops_aten_mm_default", + "executorch_exir_dialects_edge__ops_aten_addmm_default", + ] ) - - def _run_manual_dqlinear_tests(self, weight_n_bit: int, op_dtype: torch.dtype): - in_sizes = [1, 4, 4] - input_sizes = [4, 37, 17] - output_sizes = [4, 17, 37] - - for use_bias in [True, False]: - for i, _ in enumerate(in_sizes): - in_size = int(in_sizes[i]) - input_size = int(input_sizes[i]) - output_size = int(output_sizes[i]) - mod = self.ManualDQLinear( - input_channels=input_size, - output_channels=output_size, - weight_n_bit=weight_n_bit, - dtype=op_dtype, - use_bias=use_bias, - ) - - inputs = (torch.randn(1, in_size, input_size).to(op_dtype),) - self._test_manual_dq_linear(mod, inputs, use_bias=use_bias) + .to_executorch() + .serialize() + .run_method_and_compare_outputs(atol=atol, rtol=rtol) + ) diff --git a/backends/xnnpack/test/ops/lstm.py b/backends/xnnpack/test/ops/lstm.py new file mode 100644 index 00000000000..bfc6113c417 --- /dev/null +++ b/backends/xnnpack/test/ops/lstm.py @@ -0,0 +1,63 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import unittest + +import torch +from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner + +from executorch.backends.xnnpack.test.tester import Tester +from executorch.backends.xnnpack.test.tester.tester import ToEdgeTransformAndLower + + +class TestLSTM(unittest.TestCase): + class LSTMLinear(torch.nn.Module): + def __init__(self, input_size, hidden_size, out_size): + super().__init__() + self.lstm = torch.nn.LSTM( + input_size=input_size, hidden_size=hidden_size, batch_first=True + ) + self.linear = torch.nn.Linear(hidden_size, hidden_size) + self.linear2 = torch.nn.Linear(hidden_size, out_size) + + def forward(self, x): + x, hs = self.lstm(x) + x = self.linear(x[:, -1, :]) + x = self.linear2(x) + return torch.nn.functional.log_softmax(x, dim=1) + + def test_fp32_lstm(self): + ( + Tester(self.LSTMLinear(32, 32, 10), (torch.rand(1, 32, 32),)) + .export() + .to_edge_transform_and_lower() + .check_not(["executorch_exir_dialects_edge__ops_aten_addmm_default"]) + .check_not( + ["p_lstm_weight", "p_lstm_bias"] + ) # These Should be Consumed by Delegate + .to_executorch() + .serialize() + .run_method_and_compare_outputs() + ) + + def test_fp32_lstm_force_dynamic_linear(self): + ( + Tester(self.LSTMLinear(32, 32, 10), (torch.rand(1, 32, 32),)) + .export() + .to_edge_transform_and_lower( + ToEdgeTransformAndLower( + partitioners=[XnnpackPartitioner(force_fp32_dynamic_linear=True)] + ) + ) + .check_not(["executorch_exir_dialects_edge__ops_aten_addmm_default"]) + # Weights are supplied as input to linears + .check(["p_lstm_weight_hh_l0", "p_lstm_weight_ih_l0"]) + # Biases are owned by delegates + .check_not(["p_lstm_bias"]) + .to_executorch() + .serialize() + .run_method_and_compare_outputs() + ) diff --git a/backends/xnnpack/test/ops/mean_dim.py b/backends/xnnpack/test/ops/mean_dim.py index e39d3aee080..3bac5f3239c 100644 --- a/backends/xnnpack/test/ops/mean_dim.py 
+++ b/backends/xnnpack/test/ops/mean_dim.py @@ -56,6 +56,19 @@ def test_fp32_mean_dim_unsupported(self): .check_count({"executorch_exir_dialects_edge__ops_aten_mean_dim": 1}) ) + def test_fp32_mean_dim_unsupported_3d(self): + """ + XNNPack mean.dim implementation only supports 4D tensors. + """ + inputs = (torch.randn(1, 5, 4),) + ( + Tester(self.MeanDim((-1, -2)), inputs) + .export() + .check_count({"torch.ops.aten.mean.dim": 1}) + .to_edge_transform_and_lower() + .check_count({"executorch_exir_dialects_edge__ops_aten_mean_dim": 1}) + ) + def test_qs8_mean_dim(self): inputs = (torch.randn(1, 5, 4, 4),) ( diff --git a/backends/xnnpack/test/test_xnnpack_utils.py b/backends/xnnpack/test/test_xnnpack_utils.py index c6b1513d317..ea9217e04ab 100644 --- a/backends/xnnpack/test/test_xnnpack_utils.py +++ b/backends/xnnpack/test/test_xnnpack_utils.py @@ -25,6 +25,12 @@ # import the xnnpack backend implementation from executorch.backends.xnnpack.xnnpack_preprocess import XnnpackBackend +from executorch.devtools import BundledProgram + +from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite +from executorch.devtools.bundled_program.serialize import ( + serialize_from_bundled_program_to_flatbuffer, +) from executorch.exir import ExecutorchProgram, ExirExportedProgram from executorch.exir.backend.backend_api import to_backend, validation_disabled @@ -34,12 +40,6 @@ _load_for_executorch_from_buffer, ) from executorch.extension.pytree import tree_flatten -from executorch.sdk import BundledProgram - -from executorch.sdk.bundled_program.config import MethodTestCase, MethodTestSuite -from executorch.sdk.bundled_program.serialize import ( - serialize_from_bundled_program_to_flatbuffer, -) from torch.ao.quantization import ( # @manual default_per_channel_symmetric_qnnpack_qconfig, @@ -72,6 +72,7 @@ get_symmetric_quantization_config, XNNPACKQuantizer, ) +from torch.export import export_for_training from torch.testing import FileCheck @@ -315,10 +316,11 @@ def quantize_and_test_model_with_quantizer( ): module.eval() # program capture - m = torch._export.capture_pre_autograd_graph( + + m = export_for_training( module, example_inputs, - ) + ).module() quantizer = XNNPACKQuantizer() quantization_config = get_symmetric_quantization_config() diff --git a/backends/xnnpack/test/tester/tester.py b/backends/xnnpack/test/tester/tester.py index 6fdf1615215..7586c4f2313 100644 --- a/backends/xnnpack/test/tester/tester.py +++ b/backends/xnnpack/test/tester/tester.py @@ -14,7 +14,6 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union import torch -import torch.export._trace as export_trace from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner from executorch.backends.xnnpack.passes import XNNPACKPassManager from executorch.backends.xnnpack.utils.configs import get_xnnpack_edge_compile_config @@ -31,6 +30,7 @@ from executorch.exir.passes.sym_shape_eval_pass import ConstraintBasedSymShapeEvalPass from executorch.exir.print_program import pretty_print, print_program +from torch.export import export_for_training logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -157,10 +157,10 @@ def __init__( def run( self, artifact: torch.nn.Module, inputs: Optional[Tuple[torch.Tensor]] ) -> None: - captured_graph = export_trace._export( - artifact, inputs, pre_dispatch=True - ).module() + assert inputs is not None + captured_graph = export_for_training(artifact, inputs).module() + assert isinstance(captured_graph, 
torch.fx.GraphModule) prepared = prepare_pt2e(captured_graph, self.quantizer) if self.calibrate: @@ -561,7 +561,8 @@ def to_edge(self, to_edge_stage: Optional[ToEdge] = None): if not to_edge_stage: to_edge_stage = ToEdge() to_edge_stage.edge_compile_conf._skip_dim_order = True - return self._run_stage(to_edge_stage) + res = self._run_stage(to_edge_stage) + return res def to_edge_transform_and_lower( self, to_edge_and_transform_stage: Optional[ToEdgeTransformAndLower] = None diff --git a/backends/xnnpack/third-party/XNNPACK b/backends/xnnpack/third-party/XNNPACK index 1d139a3b4b7..87ee0b46b83 160000 --- a/backends/xnnpack/third-party/XNNPACK +++ b/backends/xnnpack/third-party/XNNPACK @@ -1 +1 @@ -Subproject commit 1d139a3b4b7155889c88c31f370a82c48e7ca89c +Subproject commit 87ee0b46b834f67bad9025d4a82ed5654f3403d3 diff --git a/backends/xnnpack/third-party/cpuinfo b/backends/xnnpack/third-party/cpuinfo index d6860c477c9..16bfc1622c6 160000 --- a/backends/xnnpack/third-party/cpuinfo +++ b/backends/xnnpack/third-party/cpuinfo @@ -1 +1 @@ -Subproject commit d6860c477c99f1fce9e28eb206891af3c0e1a1d7 +Subproject commit 16bfc1622c6902d6f91d316ec54894910c620325 diff --git a/backends/xnnpack/third-party/generate-xnnpack-wrappers.py b/backends/xnnpack/third-party/generate-xnnpack-wrappers.py index bda79527178..e9b23e4a784 100644 --- a/backends/xnnpack/third-party/generate-xnnpack-wrappers.py +++ b/backends/xnnpack/third-party/generate-xnnpack-wrappers.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 from __future__ import print_function +from pathlib import Path import collections import os import sys @@ -36,8 +37,8 @@ "PROD_AVX512F_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", "PROD_AVX512SKX_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", "PROD_AVX512VBMI_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", - "PROD_AVX512VNNIGFNI_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", "PROD_AVX512VNNI_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", + "PROD_AVX512VNNIGFNI_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", "PROD_RVV_MICROKERNEL_SRCS": "defined(__riscv) || defined(__riscv__)", "PROD_AVXVNNI_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", "AARCH32_ASM_MICROKERNEL_SRCS": "defined(__arm__)", @@ -46,7 +47,7 @@ # add non-prod microkernel sources here: } -SRC_NAMES = set([ +SRC_NAMES = { "OPERATOR_SRCS", "SUBGRAPH_SRCS", "LOGGING_SRCS", @@ -81,30 +82,42 @@ "PROD_AVX512F_MICROKERNEL_SRCS", "PROD_AVX512SKX_MICROKERNEL_SRCS", "PROD_AVX512VBMI_MICROKERNEL_SRCS", - "PROD_AVX512VNNIGFNI_MICROKERNEL_SRCS", "PROD_AVX512VNNI_MICROKERNEL_SRCS", + "PROD_AVX512VNNIGFNI_MICROKERNEL_SRCS", "PROD_RVV_MICROKERNEL_SRCS", "PROD_AVXVNNI_MICROKERNEL_SRCS", "AARCH32_ASM_MICROKERNEL_SRCS", "AARCH64_ASM_MICROKERNEL_SRCS", # add non-prod microkernel sources here: -]) +} def handle_singleline_parse(line): start_index = line.find("(") end_index = line.find(")") line = line[start_index+1:end_index] key_val = line.split(" ") - return key_val[0], list(map(lambda x: x[4:], key_val[1:])) + return key_val[0], [x[4:] for x in key_val[1:]] def update_sources(xnnpack_path, cmakefile = "XNNPACK/CMakeLists.txt"): + print(f"Updating sources from {cmakefile}") sources = collections.defaultdict(list) with open(os.path.join(xnnpack_path, cmakefile)) as cmake: lines = cmake.readlines() i = 0 while i < 
len(lines): line = lines[i] + + if lines[i].startswith("INCLUDE"): + file, _ = handle_singleline_parse(line) + if file.startswith("cmake/gen/"): + path = Path(xnnpack_path) / "XNNPACK" / file + local_sources = update_sources(xnnpack_path, path.absolute().as_posix()) + for k,v in local_sources.items(): + if k in sources: + sources[k] = sources[k] + local_sources[k] + else: + sources[k] = local_sources[k] if lines[i].startswith("SET") and "src/" in lines[i]: name, val = handle_singleline_parse(line) @@ -132,7 +145,7 @@ def gen_wrappers(xnnpack_path): xnnpack_sources = collections.defaultdict(list) sources = update_sources(xnnpack_path) - microkernels_sources = update_sources(xnnpack_path, "XNNPACK/cmake/microkernels.cmake") + microkernels_sources = update_sources(xnnpack_path, "XNNPACK/cmake/gen/microkernels.cmake") for key in microkernels_sources: sources[key] = microkernels_sources[key] @@ -186,6 +199,8 @@ def gen_wrappers(xnnpack_path): def main(argv): + print("Generating wrappers...") + if argv is None or len(argv) == 0: gen_wrappers(".") else: diff --git a/backends/xnnpack/third-party/xnnpack.buck.bzl b/backends/xnnpack/third-party/xnnpack.buck.bzl index a1add446643..7f0a8ca6f21 100644 --- a/backends/xnnpack/third-party/xnnpack.buck.bzl +++ b/backends/xnnpack/third-party/xnnpack.buck.bzl @@ -1,7 +1,6 @@ load("//third-party:glob_defs.bzl", "subdir_glob") load( ":xnnpack_src_defs.bzl", - "JIT_SRCS", "LOGGING_SRCS", "OPERATOR_SRCS", "SUBGRAPH_SRCS", @@ -69,27 +68,6 @@ def define_xnnpack(): ], ) - # @lint-ignore BUCKLINT: native and fb_native are explicitly forbidden in fbcode. - native.cxx_library( - name = "jit_memory", - srcs = JIT_SRCS, - headers = subdir_glob([ - ("XNNPACK/src", "**/*.h"), - ]), - header_namespace = "", - compiler_flags = [ - "-std=c++17", - ], - preferred_linkage = "static", - preprocessor_flags = [ - "-DXNN_LOG_LEVEL=0", - ], - exported_deps = [ - ":clog", - ":interface", - ], - ) - # @lint-ignore BUCKLINT: native and fb_native are explicitly forbidden in fbcode. 
native.cxx_library( name = "operators", @@ -139,7 +117,6 @@ def define_xnnpack(): preferred_linkage = "static", preprocessor_flags = [ "-DXNN_LOG_LEVEL=0", - "-DXNN_ENABLE_JIT=0", "-DXNN_ENABLE_SPARSE=0", "-DXNN_ENABLE_GEMM_M_SPECIALIZATION=0", "-DXNN_ENABLE_MEMOPT", @@ -1223,7 +1200,6 @@ def define_xnnpack(): ] ARM_XNNPACK_DEPS = [ - ":jit_memory", ":ukernels_armsimd32", ":ukernels_fp16arith", ":ukernels_asm", @@ -1246,11 +1222,10 @@ def define_xnnpack(): "XNNPACK/src/configs/hardware-config.c", "XNNPACK/src/microparams-init.c", "XNNPACK/src/operator-run.c", - "XNNPACK/src/operators/post-operation.c", "XNNPACK/src/microkernel-utils.c", ], headers = subdir_glob([ - ("XNNPACK/src", "xnnpack/*.h"), + ("XNNPACK/src", "**/*.h"), ("XNNPACK/include", "**/*.h"), ]), exported_headers = { @@ -1271,7 +1246,6 @@ def define_xnnpack(): "-DXNN_NO_X8_OPERATORS", "-DXNN_ENABLE_MEMOPT", "-DXNN_ENABLE_SPARSE=0", - "-DXNN_ENABLE_JIT=0", "-DXNN_ENABLE_ASSEMBLY", "-DXNN_ENABLE_GEMM_M_SPECIALIZATION", "-DXNN_ENABLE_ARM_DOTPROD", diff --git a/backends/xnnpack/third-party/xnnpack_src_defs.bzl b/backends/xnnpack/third-party/xnnpack_src_defs.bzl index 0a0beba7efd..d8ebe7c72bb 100644 --- a/backends/xnnpack/third-party/xnnpack_src_defs.bzl +++ b/backends/xnnpack/third-party/xnnpack_src_defs.bzl @@ -200,7 +200,6 @@ PROD_F16C_MICROKERNEL_SRCS = [ ] PROD_XOP_MICROKERNEL_SRCS = [ - "XNNPACK/src/amalgam/gen/xop.c", ] PROD_AVX512F_MICROKERNEL_SRCS = [ @@ -493,30 +492,18 @@ AARCH64_ASM_MICROKERNEL_SRCS = [ "XNNPACK/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-cortex-a55.S", "XNNPACK/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld64.S", "XNNPACK/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld128.S", - "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x8c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S", - "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x8c4-minmax-rndnu-asm-aarch64-neondot-ld128.S", "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53-prfm.S", "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53.S", "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a75-prfm.S", "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a75.S", "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64-prfm.S", "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64.S", - "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-cortex-a55.S", - "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld128.S", - "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S", - "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-ld128.S", - "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x8c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S", - "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x8c4-minmax-rndnu-asm-aarch64-neondot-ld128.S", "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53-prfm.S", "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53.S", "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a75-prfm.S", "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a75.S", 
"XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64-prfm.S", "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64.S", - "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-cortex-a55.S", - "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld128.S", - "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S", - "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-ld128.S", ] XNNPACK_SRCS = [ diff --git a/backends/xnnpack/third-party/xnnpack_wrapper_defs.bzl b/backends/xnnpack/third-party/xnnpack_wrapper_defs.bzl index 2dbb41ff01b..a9d4af95ccf 100644 --- a/backends/xnnpack/third-party/xnnpack_wrapper_defs.bzl +++ b/backends/xnnpack/third-party/xnnpack_wrapper_defs.bzl @@ -92,7 +92,6 @@ PROD_F16C_MICROKERNEL_SRCS = [ ] PROD_XOP_MICROKERNEL_SRCS = [ - "xnnpack_wrappers/amalgam/gen/xop.c", ] PROD_FMA3_MICROKERNEL_SRCS = [ @@ -447,28 +446,16 @@ AARCH64_ASM_MICROKERNEL_SRCS = [ "xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-cortex-a55.S", "xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld64.S", "xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld128.S", - "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x8c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S", - "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x8c4-minmax-rndnu-asm-aarch64-neondot-ld128.S", "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53-prfm.S", "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53.S", "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a75-prfm.S", "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a75.S", "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64-prfm.S", "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64.S", - "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-cortex-a55.S", - "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld128.S", - "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S", - "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-ld128.S", - "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x8c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S", - "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x8c4-minmax-rndnu-asm-aarch64-neondot-ld128.S", "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53-prfm.S", "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53.S", "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a75-prfm.S", "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a75.S", "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64-prfm.S", "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64.S", - "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-cortex-a55.S", - "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld128.S", - 
"xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S", - "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-ld128.S", ] diff --git a/backends/xnnpack/utils/quant_utils.py b/backends/xnnpack/utils/quant_utils.py index d5a7ec7fd0d..7c035757a6f 100644 --- a/backends/xnnpack/utils/quant_utils.py +++ b/backends/xnnpack/utils/quant_utils.py @@ -4,6 +4,10 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +import operator +from itertools import accumulate +from typing import cast + import torch from executorch.exir.backend.canonical_partitioners.config_partitioner import ( format_target_name, @@ -15,6 +19,7 @@ "quantize_per_channel.default", "quantize_per_channel_group.default", "quantize_per_token.default", + "quantize_affine.default", } _DQ_OPS = { @@ -23,12 +28,14 @@ "dequantize_per_channel.default", "dequantize_per_channel_group.default", "dequantize_per_token.default", + "dequantize_affine.default", } _QPARAM_OPS = { "choose_qparams.tensor", "choose_qparams_per_token_asymmetric.default", + "choose_qparams_affine.default", } _DYNAMIC_OPS = { @@ -43,8 +50,9 @@ def is_dynamic_qdq(node: torch.fx.Node) -> bool: if node.op != "call_function": return False node_name = format_target_name(node.target.__name__) # pyre-ignore + is_dynamic_affine = is_per_token(node) and not is_per_channel_group(node) - return node_name in _DYNAMIC_OPS + return node_name in _DYNAMIC_OPS or is_dynamic_affine def is_qparam(node: torch.fx.Node) -> bool: @@ -75,4 +83,106 @@ def is_per_channel(node: torch.fx.Node) -> bool: if not (is_quant(node) or is_dequant(node)): return False - return "per_channel" in node.target.__name__ # pyre-ignore + is_affine_per_channel_group = is_per_channel_group(node) + is_per_channel = "per_channel" in node.target.__name__ # pyre-ignore + + return is_per_channel or is_affine_per_channel_group + + +def is_affine_qdq(node: torch.fx.Node) -> bool: + if not (is_quant(node) or is_dequant(node)): + return False + + return "quantize_affine" in node.target.__name__ # pyre-ignore + + +def _get_block_size_input_scale(node: torch.fx.Node): + assert is_affine_qdq(node) + block_size = node.args[1] + input_val = node.all_input_nodes[0].meta["val"] + scale_val = node.all_input_nodes[1].meta["val"] + return block_size, input_val, scale_val + + +def is_per_token(node: torch.fx.Node): + if not (is_quant(node) or is_dequant(node)): + return False + + if "per_token" in node.target.__name__: # pyre-ignore + return True + elif is_affine_qdq(node): + block_size, input_val, scale_val = _get_block_size_input_scale(node) + flag = True + scale_numel_expected = 1 + for i in range(len(block_size) - 1): + flag &= block_size[i] == 1 + scale_numel_expected *= input_val.shape[i] + + flag &= block_size[-1] == input_val.shape[-1] + flag &= scale_val.numel() == scale_numel_expected + return flag + + return False + + +def is_per_channel_group(node: torch.fx.Node): + if not (is_quant(node) or is_dequant(node)): + return False + + if "per_channel_group" in node.target.__name__: # pyre-ignore + return True + elif is_affine_qdq(node): + block_size, input_val, scale_val = _get_block_size_input_scale(node) + flag = True + flag &= len(block_size) == 2 + flag &= block_size[0] == 1 + group_size = block_size[1] + scale_numel = list(accumulate(scale_val.shape, operator.mul))[-1] + input_numel = list(accumulate(input_val.shape, operator.mul))[-1] + flag &= input_numel == group_size * 
scale_numel + return flag + + return False + + +def extract_qdq_affine_op_args_for_decomposed_ops(node: torch.fx.Node): + if not is_affine_qdq(node): + return None, None + # make sure input_dtype and zero_point_domain have expected values + input_node = node.args[0] + scale_node = node.args[2] + zero_point_node = node.args[3] + args = [input_node, scale_node, zero_point_node] + assert ( + len(node.args) > 4 + ), f"expecting at least 6 args, got node: {node.format_node()}" + + if node.args[4] != torch.int8: + return None, None + target_dtype = cast(torch.dtype, node.args[4]) + + if len(node.args) > 6: + # quant_min + args.append(node.args[5]) + # quant_max + args.append(node.args[6]) + else: + dtype_info = torch.iinfo(target_dtype) + quant_min = dtype_info.min + quant_max = dtype_info.max + args.append(quant_min) + args.append(quant_max) + + # add target_dtype_node after quant_min/quant_max + args.append(target_dtype) + # zero_point_domain + if len(node.args) > 7 and node.args[7] != "INT": + return None, None + + if is_per_channel_group(node): + block_sizes = cast(list[int], node.args[1]) + args.append(block_sizes[-1]) + + args.append(node.args[-1]) + + return args diff --git a/build/Codegen.cmake b/build/Codegen.cmake index 1c309cf3bce..381cd0958fd 100644 --- a/build/Codegen.cmake +++ b/build/Codegen.cmake @@ -78,7 +78,8 @@ function(generate_bindings_for_kernels) # Executorch runtime. execute_process( COMMAND - "${PYTHON_EXECUTABLE}" -c "from distutils.sysconfig import get_python_lib;print(get_python_lib())" + "${PYTHON_EXECUTABLE}" -c + "from distutils.sysconfig import get_python_lib;print(get_python_lib())" OUTPUT_VARIABLE site-packages-out ERROR_VARIABLE site-packages-out-error RESULT_VARIABLE site-packages-result @@ -150,9 +151,8 @@ function(gen_custom_ops_aot_lib) include(${EXECUTORCH_ROOT}/build/Utils.cmake) target_link_options_shared_lib(${GEN_LIB_NAME}) - if(EXECUTORCH_BUILD_PYBIND AND APPLE) - target_link_libraries(${GEN_LIB_NAME} PRIVATE executorch_no_prim_ops) - target_link_options(${GEN_LIB_NAME} PRIVATE -undefined dynamic_lookup) + if(TARGET portable_lib) + target_link_libraries(${GEN_LIB_NAME} PRIVATE portable_lib) else() target_link_libraries(${GEN_LIB_NAME} PRIVATE executorch_no_prim_ops) endif() diff --git a/build/Test.cmake b/build/Test.cmake index b2b23cb03ad..d6ef124793c 100644 --- a/build/Test.cmake +++ b/build/Test.cmake @@ -5,8 +5,8 @@ # LICENSE file in the root directory of this source tree. # -# This file is intended to have helper functions for test-related -# CMakeLists.txt files. +# This file is intended to have helper functions for test-related CMakeLists.txt +# files. 
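The quant_utils.py hunk above teaches the XNNPACK partitioner to recognize torchao's `quantize_affine`/`dequantize_affine` ops as per-token or per-channel-group quantization by comparing the op's `block_size` argument against the input and scale shapes. A minimal sketch of that block-size reasoning on plain shapes follows (the helper names below are illustrative only, not part of the ExecuTorch API):

```python
# Re-statement of the block_size checks added in
# backends/xnnpack/utils/quant_utils.py, using plain tuples instead of FX nodes.
import math


def looks_per_token(block_size, input_shape, scale_numel) -> bool:
    # Per-token: every leading block dim is 1 and the last block dim spans the
    # whole innermost input dim, giving one scale per "token".
    if block_size[-1] != input_shape[-1]:
        return False
    expected_scales = 1
    for b, dim in zip(block_size[:-1], input_shape[:-1]):
        if b != 1:
            return False
        expected_scales *= dim
    return scale_numel == expected_scales


def looks_per_channel_group(block_size, input_shape, scale_numel) -> bool:
    # Per-channel-group: 2-D block_size of (1, group_size) with
    # input_numel == group_size * scale_numel.
    if len(block_size) != 2 or block_size[0] != 1:
        return False
    group_size = block_size[1]
    return math.prod(input_shape) == group_size * scale_numel


# A (1, 32) block over an 8x128 weight with 32 scales is per-channel-group:
assert looks_per_channel_group((1, 32), (8, 128), scale_numel=32)
# A (1, 512) block over a 4x512 activation with 4 scales is per-token:
assert looks_per_token((1, 512), (4, 512), scale_numel=4)
```

The real implementation performs the same comparison on `node.args[1]` (the block size) and the `val` metadata of the input and scale nodes, and `is_dynamic_qdq` additionally treats per-token affine ops that are not per-channel-group as dynamic quantization.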
# # ### Editing this file ### # @@ -25,61 +25,66 @@ find_package(executorch CONFIG REQUIRED) enable_testing() find_package(GTest CONFIG REQUIRED) +target_link_options_shared_lib(cpuinfo) target_link_options_shared_lib(extension_data_loader) target_link_options_shared_lib(portable_kernels) target_link_options_shared_lib(portable_ops_lib) +target_link_options_shared_lib(pthreadpool) target_link_options_shared_lib(quantized_ops_lib) # Add code coverage flags to supported compilers if(EXECUTORCH_USE_CPP_CODE_COVERAGE) if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") - string(APPEND CMAKE_C_FLAGS " --coverage -fprofile-abs-path") - string(APPEND CMAKE_CXX_FLAGS " --coverage -fprofile-abs-path") + string(APPEND CMAKE_C_FLAGS " --coverage -fprofile-abs-path") + string(APPEND CMAKE_CXX_FLAGS " --coverage -fprofile-abs-path") elseif("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang") - string(APPEND CMAKE_C_FLAGS " -fprofile-instr-generate -fcoverage-mapping") - string(APPEND CMAKE_CXX_FLAGS " -fprofile-instr-generate -fcoverage-mapping") + string(APPEND CMAKE_C_FLAGS " -fprofile-instr-generate -fcoverage-mapping") + string(APPEND CMAKE_CXX_FLAGS + " -fprofile-instr-generate -fcoverage-mapping" + ) else() - message(ERROR "Code coverage for compiler ${CMAKE_CXX_COMPILER_ID} is unsupported") + message(ERROR + "Code coverage for compiler ${CMAKE_CXX_COMPILER_ID} is unsupported" + ) endif() endif() -# A helper function to generate a gtest cxx executable target -# @param target_name: name for the executable -# @param SOURCES : test sources to be compiled. Sometimes -# util sources are used as well -# @param EXTRA LIBS : additional libraries to be linked against -# the target. gtest, gmock, executorch are linked by default, but Sometimes -# user may need additional libraries like kernels. -# We use CMake package executorch in this helper, so user can easily add -# installed libraries. +# A helper function to generate a gtest cxx executable target @param +# target_name: name for the executable @param SOURCES : test +# sources to be compiled. Sometimes util sources are used as well @param EXTRA +# LIBS : additional libraries to be linked against the target. +# gtest, gmock, executorch are linked by default, but Sometimes user may need +# additional libraries like kernels. We use CMake package executorch in this +# helper, so user can easily add installed libraries. # -# Example: -# et_cxx_test(my_test SOURCES my_test.cpp EXTRA_LIBS portable_kernels) +# Example: et_cxx_test(my_test SOURCES my_test.cpp EXTRA_LIBS portable_kernels) # # This defines a gtest executable my_test, compiling my_test.cpp, and linking # against libportable_kernels.a. # function(et_cxx_test target_name) -set(multi_arg_names SOURCES EXTRA_LIBS) -cmake_parse_arguments(ET_CXX_TEST "" "" "${multi_arg_names}" ${ARGN}) + set(multi_arg_names SOURCES EXTRA_LIBS) + cmake_parse_arguments(ET_CXX_TEST "" "" "${multi_arg_names}" ${ARGN}) -# Let files say "include ". -target_include_directories(executorch INTERFACE ${EXECUTORCH_ROOT}/..) + # Let files say "include ". + target_include_directories(executorch INTERFACE ${EXECUTORCH_ROOT}/..) 
-set(ET_TEST_UTIL_SOURCES ${EXECUTORCH_ROOT}/runtime/core/exec_aten/testing_util/tensor_util.cpp) + set(ET_TEST_UTIL_SOURCES + ${EXECUTORCH_ROOT}/runtime/core/exec_aten/testing_util/tensor_util.cpp + ) -add_executable(${target_name} ${ET_CXX_TEST_SOURCES} ${ET_TEST_UTIL_SOURCES}) -# Includes gtest, gmock, executorch by default -target_link_libraries( - ${target_name} GTest::gtest GTest::gtest_main GTest::gmock executorch - ${ET_CXX_TEST_EXTRA_LIBS} -) + add_executable(${target_name} ${ET_CXX_TEST_SOURCES} ${ET_TEST_UTIL_SOURCES}) + # Includes gtest, gmock, executorch by default + target_link_libraries( + ${target_name} GTest::gtest GTest::gtest_main GTest::gmock executorch + ${ET_CXX_TEST_EXTRA_LIBS} + ) -# add_test adds a test target to be used by ctest. -# We use `ExecuTorchTest` as the ctest target name for the test executable -# Usage: cd cmake-out/path/to/test/; ctest -# Note: currently we directly invoke the test target, without using ctest -add_test(ExecuTorchTest ${target_name}) + # add_test adds a test target to be used by ctest. We use `ExecuTorchTest` as + # the ctest target name for the test executable Usage: cd + # cmake-out/path/to/test/; ctest Note: currently we directly invoke the test + # target, without using ctest + add_test(ExecuTorchTest ${target_name}) endfunction() diff --git a/build/Utils.cmake b/build/Utils.cmake index 56fc1e104b0..3ea616d5900 100644 --- a/build/Utils.cmake +++ b/build/Utils.cmake @@ -65,6 +65,12 @@ function(executorch_print_configuration_summary) message(STATUS " EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL : " "${EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL}" ) + message(STATUS " EXECUTORCH_BUILD_EXTENSION_TENSOR : " + "${EXECUTORCH_BUILD_EXTENSION_TENSOR}" + ) + message(STATUS " EXECUTORCH_BUILD_EXTENSION_TRAINING : " + "${EXECUTORCH_BUILD_EXTENSION_TRAINING}" + ) message( STATUS " EXECUTORCH_BUILD_FLATC : ${EXECUTORCH_BUILD_FLATC}" @@ -97,7 +103,7 @@ function(executorch_print_configuration_summary) "${EXECUTORCH_BUILD_KERNELS_QUANTIZED}" ) message( - STATUS " EXECUTORCH_BUILD_SDK : ${EXECUTORCH_BUILD_SDK}" + STATUS " EXECUTORCH_BUILD_DEVTOOLS : ${EXECUTORCH_BUILD_DEVTOOLS}" ) message( STATUS @@ -143,11 +149,21 @@ function(macos_kernel_link_options target_name) ) endfunction() +# Same as kernel_link_options but it's for MSVC linker +function(msvc_kernel_link_options target_name) + target_link_options( + ${target_name} INTERFACE + "SHELL:LINKER:/WHOLEARCHIVE:$" + ) +endfunction() + # Ensure that the load-time constructor functions run. By default, the linker # would remove them since there are no other references to them. function(target_link_options_shared_lib target_name) if(APPLE) macos_kernel_link_options(${target_name}) + elseif(MSVC) + msvc_kernel_link_options(${target_name}) else() kernel_link_options(${target_name}) endif() @@ -171,11 +187,20 @@ function(extract_sources sources_file) set(executorch_root ${CMAKE_CURRENT_SOURCE_DIR}) endif() + if(ANDROID_ABI) + if("${ANDROID_ABI}" STREQUAL "arm64-v8a") + set(target_platforms_arg "--target-platforms=shim//:android-arm64") + elseif("${ANDROID_ABI}" STREQUAL "x86_64") + set(target_platforms_arg "--target-platforms=shim//:android-x86_64") + else() + message(FATAL_ERROR "Unsupported ANDROID_ABI setting ${ANDROID_ABI}. 
Please add it here!") + endif() + endif() execute_process( COMMAND ${PYTHON_EXECUTABLE} ${executorch_root}/build/extract_sources.py --config=${executorch_root}/build/cmake_deps.toml --out=${sources_file} - --buck2=${BUCK2} + --buck2=${BUCK2} ${target_platforms_arg} OUTPUT_VARIABLE gen_srcs_output ERROR_VARIABLE gen_srcs_error RESULT_VARIABLE gen_srcs_exit_code diff --git a/build/build_android_llm_demo.sh b/build/build_android_llm_demo.sh index a11e54f932d..42034c254f4 100644 --- a/build/build_android_llm_demo.sh +++ b/build/build_android_llm_demo.sh @@ -17,15 +17,16 @@ build_jar() { build_android_native_library() { ANDROID_ABI="$1" - TOKENIZER="$2" ANDROID_NDK="${ANDROID_NDK:-/opt/ndk}" CMAKE_OUT="cmake-out-android-${ANDROID_ABI}" - if [[ $TOKENIZER == "tiktoken" ]]; then - EXECUTORCH_USE_TIKTOKEN=ON + QNN_SDK_ROOT="${QNN_SDK_ROOT:-}" + if [ -n "$QNN_SDK_ROOT" ]; then + EXECUTORCH_BUILD_QNN=ON else - EXECUTORCH_USE_TIKTOKEN=OFF + EXECUTORCH_BUILD_QNN=OFF fi + cmake . -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK}/build/cmake/android.toolchain.cmake" \ -DANDROID_ABI="${ANDROID_ABI}" \ @@ -36,9 +37,13 @@ build_android_native_library() { -DEXECUTORCH_XNNPACK_SHARED_WORKSPACE=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ + -DEXECUTORCH_BUILD_QNN="${EXECUTORCH_BUILD_QNN}" \ + -DQNN_SDK_ROOT="${QNN_SDK_ROOT}" \ -DCMAKE_BUILD_TYPE=Release \ -B"${CMAKE_OUT}" @@ -49,21 +54,6 @@ build_android_native_library() { fi cmake --build "${CMAKE_OUT}" -j "${CMAKE_JOBS}" --target install --config Release - cmake examples/models/llama2 \ - -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ - -DANDROID_ABI="$ANDROID_ABI" \ - -DANDROID_PLATFORM=android-23 \ - -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ - -DEXECUTORCH_USE_TIKTOKEN="${EXECUTORCH_USE_TIKTOKEN}" \ - -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ - -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ - -DEXECUTORCH_BUILD_XNNPACK=ON \ - -DCMAKE_BUILD_TYPE=Release \ - -B"${CMAKE_OUT}"/examples/models/llama2 - - cmake --build "${CMAKE_OUT}"/examples/models/llama2 -j "${CMAKE_JOBS}" --config Release - - cmake extension/android \ -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake \ -DANDROID_ABI="${ANDROID_ABI}" \ @@ -71,8 +61,8 @@ build_android_native_library() { -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ -DEXECUTORCH_ENABLE_LOGGING=ON \ -DEXECUTORCH_LOG_LEVEL=Info \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ -DEXECUTORCH_BUILD_LLAMA_JNI=ON \ - -DEXECUTORCH_USE_TIKTOKEN="${EXECUTORCH_USE_TIKTOKEN}" \ -DCMAKE_BUILD_TYPE=Release \ -B"${CMAKE_OUT}"/extension/android @@ -81,6 +71,19 @@ build_android_native_library() { # Copy artifacts to ABI specific directory mkdir -p "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}" cp "${CMAKE_OUT}"/extension/android/*.so "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/" + + # Copy QNN related so library + if [ -n "$QNN_SDK_ROOT" ] && [ "$ANDROID_ABI" == "arm64-v8a" ]; then + cp "${CMAKE_OUT}"/lib/libqnn_executorch_backend.so "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/" + cp "${QNN_SDK_ROOT}"/lib/aarch64-android/libQnnHtp.so "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/" + cp "${QNN_SDK_ROOT}"/lib/aarch64-android/libQnnSystem.so "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/" + cp "${QNN_SDK_ROOT}"/lib/aarch64-android/libQnnHtpV69Stub.so 
"${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/" + cp "${QNN_SDK_ROOT}"/lib/aarch64-android/libQnnHtpV73Stub.so "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/" + cp "${QNN_SDK_ROOT}"/lib/aarch64-android/libQnnHtpV75Stub.so "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/" + cp "${QNN_SDK_ROOT}"/lib/hexagon-v69/unsigned/libQnnHtpV69Skel.so "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/" + cp "${QNN_SDK_ROOT}"/lib/hexagon-v73/unsigned/libQnnHtpV73Skel.so "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/" + cp "${QNN_SDK_ROOT}"/lib/hexagon-v75/unsigned/libQnnHtpV75Skel.so "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/" + fi } build_aar() { @@ -93,23 +96,28 @@ build_aar() { # between Java and JNI find jni -type f -name "libexecutorch_jni.so" -exec bash -c 'mv "$1" "${1/_jni/}"' bash {} \; # Zip all necessary files into the AAR file - zip -r executorch.aar libs jni/*/libexecutorch.so AndroidManifest.xml - zip -r executorch-llama.aar libs jni/*/libexecutorch_llama_jni.so AndroidManifest.xml + zip -r executorch.aar libs jni/*/libexecutorch.so jni/*/libqnn*.so jni/*/libQnn*.so AndroidManifest.xml + cp executorch.aar executorch-llama.aar popd } -build_android_llm_demo_app() { +build_android_demo_apps() { mkdir -p examples/demo-apps/android/LlamaDemo/app/libs cp ${BUILD_AAR_DIR}/executorch-llama.aar examples/demo-apps/android/LlamaDemo/app/libs pushd examples/demo-apps/android/LlamaDemo ANDROID_HOME="${ANDROID_SDK:-/opt/android/sdk}" ./gradlew build assembleAndroidTest popd + + mkdir -p extension/android/benchmark/app/libs + cp ${BUILD_AAR_DIR}/executorch.aar extension/android/benchmark/app/libs + pushd extension/android/benchmark + ANDROID_HOME="${ANDROID_SDK:-/opt/android/sdk}" ./gradlew build assembleAndroidTest + popd } collect_artifacts_to_be_uploaded() { - TOKENIZER="$1" - ARTIFACTS_DIR_NAME="$2" - DEMO_APP_DIR="${ARTIFACTS_DIR_NAME}/llm_demo_${TOKENIZER}" + ARTIFACTS_DIR_NAME="$1" + DEMO_APP_DIR="${ARTIFACTS_DIR_NAME}/llm_demo" # The app directory is named using its build flavor as a suffix. 
mkdir -p "${DEMO_APP_DIR}" # Collect the app and its test suite @@ -124,20 +132,26 @@ collect_artifacts_to_be_uploaded() { # Collect JAR and AAR cp extension/android/build/libs/executorch.jar "${DEMO_APP_DIR}" find "${BUILD_AAR_DIR}/" -name 'executorch*.aar' -exec cp {} "${DEMO_APP_DIR}" \; + # Collect MiniBench APK + MINIBENCH_APP_DIR="${ARTIFACTS_DIR_NAME}/minibench" + mkdir -p "${MINIBENCH_APP_DIR}" + cp extension/android/benchmark/app/build/outputs/apk/debug/*.apk "${MINIBENCH_APP_DIR}" + cp extension/android/benchmark/app/build/outputs/apk/androidTest/debug/*.apk "${MINIBENCH_APP_DIR}" } BUILD_AAR_DIR="$(mktemp -d)" export BUILD_AAR_DIR -ANDROID_ABIS=("arm64-v8a" "x86_64") +if [ -z "$ANDROID_ABIS" ]; then + ANDROID_ABIS=("arm64-v8a" "x86_64") +fi export ANDROID_ABIS -TOKENIZER="${1:-bpe}" -ARTIFACTS_DIR_NAME="$2" +ARTIFACTS_DIR_NAME="$1" build_jar for ANDROID_ABI in "${ANDROID_ABIS[@]}"; do - build_android_native_library ${ANDROID_ABI} ${TOKENIZER} + build_android_native_library ${ANDROID_ABI} done build_aar -build_android_llm_demo_app -collect_artifacts_to_be_uploaded ${TOKENIZER} ${ARTIFACTS_DIR_NAME} +build_android_demo_apps +collect_artifacts_to_be_uploaded ${ARTIFACTS_DIR_NAME} diff --git a/build/build_apple_frameworks.sh b/build/build_apple_frameworks.sh index a22fd4ecb9d..6e3b8c0c5ed 100755 --- a/build/build_apple_frameworks.sh +++ b/build/build_apple_frameworks.sh @@ -33,6 +33,7 @@ libexecutorch_no_prim_ops.a,\ libextension_apple.a,\ libextension_data_loader.a,\ libextension_module.a,\ +libextension_tensor.a,\ :$HEADERS_PATH" FRAMEWORK_BACKEND_COREML="backend_coreml:\ @@ -56,7 +57,7 @@ libcustom_ops.a,\ FRAMEWORK_KERNELS_OPTIMIZED="kernels_optimized:\ liboptimized_kernels.a,\ -liboptimized_ops_lib.a,\ +liboptimized_native_cpu_ops_lib.a,\ :" FRAMEWORK_KERNELS_PORTABLE="kernels_portable:\ @@ -162,9 +163,11 @@ cmake_build() { -DEXECUTORCH_BUILD_COREML=$COREML \ -DEXECUTORCH_BUILD_MPS=$MPS \ -DEXECUTORCH_BUILD_XNNPACK=$XNNPACK \ + -DEXECUTORCH_XNNPACK_SHARED_WORKSPACE=ON \ -DEXECUTORCH_BUILD_EXTENSION_APPLE=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM=$CUSTOM \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=$OPTIMIZED \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=$QUANTIZED \ @@ -188,6 +191,7 @@ mkdir -p "$HEADERS_PATH" "$SOURCE_ROOT_DIR"/build/print_exported_headers.py --buck2="$BUCK2" --targets \ //extension/module: \ + //extension/tensor: \ | rsync -av --files-from=- "$SOURCE_ROOT_DIR" "$HEADERS_PATH/executorch" cp "$SOURCE_ROOT_DIR/extension/apple/ExecuTorch/Exported/"*.h "$HEADERS_PATH/executorch" diff --git a/build/build_apple_llm_demo.sh b/build/build_apple_llm_demo.sh new file mode 100755 index 00000000000..9fe1c1bcd77 --- /dev/null +++ b/build/build_apple_llm_demo.sh @@ -0,0 +1,58 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +set -euo pipefail + +ARTIFACTS_DIR_NAME="$1" +APP_PATH="extension/apple/Benchmark/Benchmark" + +xcodebuild build-for-testing \ + -project "${APP_PATH}.xcodeproj" \ + -scheme Benchmark \ + -destination "platform=iOS" \ + -sdk iphoneos \ + -allowProvisioningUpdates \ + DEVELOPMENT_TEAM=78E7V7QP35 \ + CODE_SIGN_STYLE=Manual \ + PROVISIONING_PROFILE_SPECIFIER="ExecuTorch Benchmark" \ + CODE_SIGN_IDENTITY="iPhone Distribution" \ + CODE_SIGNING_REQUIRED=No \ + CODE_SIGNING_ALLOWED=No + +# The hack to figure out where the xctest package locates +BUILD_DIR=$(xcodebuild -showBuildSettings -project "$APP_PATH.xcodeproj" -json | jq -r ".[0].buildSettings.BUILD_DIR") + +# Prepare the demo app, debug mode here is the default from xcodebuild and match +# with what we have in the test spec +MODE="Release" +PLATFORM="iphoneos" +pushd "${BUILD_DIR}/${MODE}-${PLATFORM}" + +rm -rf Payload && mkdir Payload +APP_NAME=Benchmark + +ls -lah +cp -r "${APP_NAME}.app" Payload && zip -vr "${APP_NAME}.ipa" Payload + +popd + +# Prepare the test suite +pushd "${BUILD_DIR}" + +ls -lah +zip -vr "${APP_NAME}.xctestrun.zip" *.xctestrun + +popd + +if [[ -n "${ARTIFACTS_DIR_NAME}" ]]; then + mkdir -p "${ARTIFACTS_DIR_NAME}" + # Prepare all the artifacts to upload + cp "${BUILD_DIR}/${MODE}-${PLATFORM}/${APP_NAME}.ipa" "${ARTIFACTS_DIR_NAME}/" + cp "${BUILD_DIR}/${APP_NAME}.xctestrun.zip" "${ARTIFACTS_DIR_NAME}/" + + ls -lah "${ARTIFACTS_DIR_NAME}/" +fi diff --git a/build/cmake_deps.toml b/build/cmake_deps.toml index a051dad027d..c0011f175ea 100644 --- a/build/cmake_deps.toml +++ b/build/cmake_deps.toml @@ -73,6 +73,7 @@ excludes = [ deps = [ "executorch", "executorch_no_prim_ops", + "extension_threadpool", "portable_kernels", ] @@ -116,6 +117,20 @@ deps = [ "executorch", ] +[targets.optimized_native_cpu_ops_oss] +buck_targets = [ + "//configurations:optimized_native_cpu_ops_oss", +] +filters = [ + ".cpp$", +] +excludes = [ +] +deps = [ + "executorch_no_prim_ops", + "executorch", + "portable_kernels", +] # ---------------------------------- core end ---------------------------------- # ---------------------------------- extension start ---------------------------------- [targets.extension_data_loader] @@ -171,6 +186,58 @@ deps = [ "extension_module", "extension_runner_util", ] + +[targets.extension_tensor] +buck_targets = [ + "//extension/tensor:tensor", +] +filters = [ + ".cpp$", +] +deps = [ + "executorch", + "executorch_no_prim_ops", +] + +[targets.extension_threadpool] +buck_targets = [ + "//extension/threadpool:threadpool", +] +filters = [ + ".cpp$", +] +deps = [ + "executorch", + "executorch_no_prim_ops", +] + +[targets.extension_training] +buck_targets = [ + "//extension/training/module:training_module", + "//extension/training/optimizer:sgd", +] +filters = [ + ".cpp$", +] +deps = [ + "executorch_no_prim_ops", +] + +[targets.train_xor] +buck_targets = [ + "//extension/training/examples/XOR:train_xor", +] +filters = [ + ".cpp$", +] +excludes = [ + "^codegen", +] +deps = [ + "executorch", + "executorch_no_prim_ops", + "portable_kernels", +] # ---------------------------------- extension end ---------------------------------- # ---------------------------------- binary start ---------------------------------- @@ -298,7 +365,10 @@ buck_targets = [ "//extension/llm/custom_ops:custom_ops", ] filters = [ - ".cpp$", + # Second clause is to pick up fht_neon.c/fht_avx.c from FFHT. TODO: + # remove filters and patch extract_sources.py's Buck query to fetch + # srcs; presumably filters is here to remove .h files. 
+ "(.cpp$)|(fht.*\\.c$)", ] excludes = [ "^codegen", @@ -307,6 +377,7 @@ deps = [ "executorch", "executorch_no_prim_ops", "optimized_kernels", + "extension_threadpool", "xnnpack_backend", ] @@ -329,5 +400,6 @@ deps = [ "portable_kernels", "quantized_kernels", "xnnpack_backend", + "optimized_native_cpu_ops_oss", ] # ---------------------------------- LLama end ---------------------------------- diff --git a/build/executorch-config.cmake b/build/executorch-config.cmake index 962990d7c82..18b6c7801b9 100644 --- a/build/executorch-config.cmake +++ b/build/executorch-config.cmake @@ -46,6 +46,9 @@ set(lib_list extension_module extension_module_static extension_runner_util + extension_tensor + extension_threadpool + extension_training xnnpack_backend XNNPACK cpuinfo diff --git a/build/extract_sources.py b/build/extract_sources.py index ce8b3de9812..5004fe0c508 100755 --- a/build/extract_sources.py +++ b/build/extract_sources.py @@ -11,7 +11,7 @@ import re from enum import Enum -from typing import Any, Optional, Sequence +from typing import Any, List, Optional, Sequence from buck_util import Buck2Runner @@ -96,7 +96,12 @@ def __init__( else: self._config[k] = v - def get_sources(self, graph: "Graph", runner: Buck2Runner) -> frozenset[str]: + def get_sources( + self, graph: "Graph", runner: Buck2Runner, buck_args: Optional[List[str]] + ) -> frozenset[str]: + if buck_args is None: + buck_args = [] + if self._state == Target._InitState.READY: return self._sources # Detect cycles. @@ -113,7 +118,7 @@ def get_sources(self, graph: "Graph", runner: Buck2Runner) -> frozenset[str]: ) # Get the complete list of source files that this target depends on. - sources: set[str] = set(runner.run(["cquery", query])) + sources: set[str] = set(runner.run(["cquery", query] + buck_args)) # Keep entries that match all of the filters. filters = [re.compile(p) for p in self._config.get("filters", [])] @@ -128,7 +133,9 @@ def get_sources(self, graph: "Graph", runner: Buck2Runner) -> frozenset[str]: # its deps. Remove entries that are already covered by the transitive # set of dependencies. for dep in self._config.get("deps", []): - sources.difference_update(graph.by_name[dep].get_sources(graph, runner)) + sources.difference_update( + graph.by_name[dep].get_sources(graph, runner, buck_args) + ) self._sources = frozenset(sources) self._state = Target._InitState.READY @@ -173,6 +180,9 @@ def parse_args() -> argparse.Namespace: metavar="file", help="Path to the file to generate.", ) + parser.add_argument( + "--target-platforms", help="--target-platforms to pass to buck cquery, if any." + ) return parser.parse_args() @@ -199,8 +209,12 @@ def main(): # Run the queries and get the lists of source files. target_to_srcs: dict[str, list[str]] = {} runner: Buck2Runner = Buck2Runner(args.buck2) + buck_args = [] + if args.target_platforms: + buck_args = ["--target-platforms"] + buck_args.append(args.target_platforms) for name, target in graph.by_name.items(): - target_to_srcs[name] = sorted(target.get_sources(graph, runner)) + target_to_srcs[name] = sorted(target.get_sources(graph, runner, buck_args)) # Generate the requested format. 
output: bytes diff --git a/build/pip_data_bin_init.py.in b/build/pip_data_bin_init.py.in index 9644c5621df..0c9d60e0498 100644 --- a/build/pip_data_bin_init.py.in +++ b/build/pip_data_bin_init.py.in @@ -21,7 +21,9 @@ def _find_executable_files_under(dir): for filename in os.listdir(dir): filepath = os.path.join(dir, filename) if os.path.isfile(filepath) and os.access(filepath, os.X_OK): - bin_names.append(filename) + # Remove .exe suffix on windows. + filename_without_ext = os.path.splitext(filename)[0] + bin_names.append(filename_without_ext) return bin_names # The list of binaries to create wrapper functions for. diff --git a/build/test_ios_ci.sh b/build/test_ios_ci.sh index 5fa6ef7d246..50c6448d4b2 100755 --- a/build/test_ios_ci.sh +++ b/build/test_ios_ci.sh @@ -11,6 +11,9 @@ APP_PATH="examples/demo-apps/apple_ios/ExecuTorchDemo/ExecuTorchDemo" MODEL_NAME="mv3" SIMULATOR_NAME="executorch" +# If this is set, copy the build artifacts to this directory +ARTIFACTS_DIR_NAME="$1" + finish() { EXIT_STATUS=$? if xcrun simctl list | grep -q "$SIMULATOR_NAME"; then @@ -64,3 +67,49 @@ xcodebuild test \ -project "$APP_PATH.xcodeproj" \ -scheme MobileNetClassifierTest \ -destination name="$SIMULATOR_NAME" + +# NB: https://docs.aws.amazon.com/devicefarm/latest/developerguide/test-types-ios-xctest-ui.html +say "Package The Test Suite" + +xcodebuild build-for-testing \ + -project "$APP_PATH.xcodeproj" \ + -scheme MobileNetClassifierTest \ + -destination platform="iOS" \ + -allowProvisioningUpdates \ + DEVELOPMENT_TEAM=78E7V7QP35 \ + CODE_SIGN_STYLE=Manual \ + PROVISIONING_PROFILE_SPECIFIER=ExecuTorchDemo \ + CODE_SIGN_IDENTITY="iPhone Distribution" + +# The hack to figure out where the xctest package locates +BUILD_DIR=$(xcodebuild -showBuildSettings -project "$APP_PATH.xcodeproj" -json | jq -r ".[0].buildSettings.BUILD_DIR") + +# Prepare the demo app +MODE="Debug" +PLATFORM="iphoneos" +pushd "${BUILD_DIR}/${MODE}-${PLATFORM}" + +rm -rf Payload && mkdir Payload +MOCK_APP_NAME=ExecuTorchDemo + +ls -lah +cp -r "${MOCK_APP_NAME}.app" Payload && zip -vr "${MOCK_APP_NAME}.ipa" Payload + +popd + +# Prepare the test suite +pushd "${BUILD_DIR}" + +ls -lah +zip -vr "${MOCK_APP_NAME}.xctestrun.zip" *.xctestrun + +popd + +if [[ -n "${ARTIFACTS_DIR_NAME}" ]]; then + mkdir -p "${ARTIFACTS_DIR_NAME}" + # Prepare all the artifacts to upload + cp "${BUILD_DIR}/${MODE}-${PLATFORM}/${MOCK_APP_NAME}.ipa" "${ARTIFACTS_DIR_NAME}/" + cp "${BUILD_DIR}/${MOCK_APP_NAME}.xctestrun.zip" "${ARTIFACTS_DIR_NAME}/" + + ls -lah "${ARTIFACTS_DIR_NAME}/" +fi diff --git a/codegen/templates/RegisterCodegenUnboxedKernels.cpp b/codegen/templates/RegisterCodegenUnboxedKernels.cpp index a7790be7fed..3076cde1a99 100644 --- a/codegen/templates/RegisterCodegenUnboxedKernels.cpp +++ b/codegen/templates/RegisterCodegenUnboxedKernels.cpp @@ -8,6 +8,7 @@ #include #include +#include #include #include #include "${fn_header}" // Generated Function import headers @@ -21,7 +22,8 @@ // JIT op registry instead of c10 dispatcher. JIT op registry only takes boxed // kernels, so we are calling unboxing functions in UnboxingFunctions.h to cast // arguments into C++ types (instead of IValue) and delegate to unboxed kernels. 
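The pip_data_bin_init.py.in change above is about Windows wheels: bundled binaries are discovered as, e.g., `flatc.exe`, but the generated wrapper functions should keep the extension-less names used on Linux and macOS. A sketch of the updated helper, mirroring the diff:

```python
# Mirrors the updated _find_executable_files_under() in
# build/pip_data_bin_init.py.in: collect executable files and register them
# without their extension, so "flatc.exe" and "flatc" map to the same wrapper.
import os
from typing import List


def find_executable_names(directory: str) -> List[str]:
    names = []
    for filename in os.listdir(directory):
        path = os.path.join(directory, filename)
        if os.path.isfile(path) and os.access(path, os.X_OK):
            names.append(os.path.splitext(filename)[0])  # drops ".exe" on Windows
    return names
```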
-using KernelArrayRef = ::torch::executor::ArrayRef<::torch::executor::Kernel>; +using KernelSpan = + ::executorch::runtime::Span; namespace torch { namespace executor { namespace function { @@ -31,15 +33,15 @@ static Kernel kernels_to_register[] = { ${unboxed_kernels} // Generated kernels }; -// Explicitly convert to ArrayRef, so that the API can take an empty C array of +// Explicitly convert to Span, so that the API can take an empty C array of // Kernels. -static KernelArrayRef kernel_array_ref( +static KernelSpan kernel_span( kernels_to_register, kernels_to_register + sizeof(kernels_to_register) / sizeof(Kernel)); // Return value not used. Keep the static variable assignment to register // kernels in static initialization time. -static auto success_with_kernel_reg = register_kernels(kernel_array_ref); +static auto success_with_kernel_reg = register_kernels(kernel_span); } // namespace } // namespace function } // namespace executor diff --git a/codegen/templates/RegisterKernels.cpp b/codegen/templates/RegisterKernels.cpp index 2313a30a307..91eac200222 100644 --- a/codegen/templates/RegisterKernels.cpp +++ b/codegen/templates/RegisterKernels.cpp @@ -19,7 +19,8 @@ Error register_all_kernels() { Kernel kernels_to_register[] = { ${unboxed_kernels} // Generated kernels }; - Error success_with_kernel_reg = register_kernels(kernels_to_register); + Error success_with_kernel_reg = + ::executorch::runtime::register_kernels({kernels_to_register}); if (success_with_kernel_reg != Error::Ok) { ET_LOG(Error, "Failed register all kernels"); return success_with_kernel_reg; diff --git a/codegen/tools/gen_oplist.py b/codegen/tools/gen_oplist.py index f21fb8dc6b5..fbb191a6a81 100644 --- a/codegen/tools/gen_oplist.py +++ b/codegen/tools/gen_oplist.py @@ -230,7 +230,7 @@ def gen_oplist( if model_file_path: assert os.path.isfile( model_file_path - ), "The value for --model_file_path needs to be a valid file." + ), f"The value for --model_file_path needs to be a valid file, got {model_file_path}" op_set.update(_get_operators(model_file_path)) source_name = model_file_path et_kernel_metadata = merge_et_kernel_metadata( @@ -239,7 +239,7 @@ def gen_oplist( if ops_schema_yaml_path: assert os.path.isfile( ops_schema_yaml_path - ), "The value for --ops_schema_yaml_path needs to be a valid file." 
+ ), f"The value for --ops_schema_yaml_path needs to be a valid file, got {ops_schema_yaml_path}" et_kernel_metadata = merge_et_kernel_metadata( et_kernel_metadata, _get_et_kernel_metadata_from_ops_yaml(ops_schema_yaml_path), @@ -300,14 +300,33 @@ def main(args: List[Any]) -> None: ) options = parser.parse_args(args) - gen_oplist( - output_path=options.output_path, - model_file_path=options.model_file_path, - ops_schema_yaml_path=options.ops_schema_yaml_path, - root_ops=options.root_ops, - ops_dict=options.ops_dict, - include_all_operators=options.include_all_operators, - ) + try: + gen_oplist( + output_path=options.output_path, + model_file_path=options.model_file_path, + ops_schema_yaml_path=options.ops_schema_yaml_path, + root_ops=options.root_ops, + ops_dict=options.ops_dict, + include_all_operators=options.include_all_operators, + ) + except Exception as e: + command = ["python codegen/tools/gen_oplist.py"] + if options.model_file_path: + command.append(f"--model_file_path {options.model_file_path}") + if options.ops_schema_yaml_path: + command.append(f"--ops_schema_yaml_path {options.ops_schema_yaml_path}") + if options.root_ops: + command.append(f"--root_ops {options.root_ops}") + if options.ops_dict: + command.append(f"--ops_dict {options.ops_dict}") + if options.include_all_operators: + command.append("--include-all-operators") + repro_command = " ".join(command) + raise RuntimeError( + f"""Failed to generate selected_operators.yaml. Repro command: + {repro_command} + """ + ) from e if __name__ == "__main__": diff --git a/codegen/tools/test/test_gen_oplist.py b/codegen/tools/test/test_gen_oplist.py index d455ddb6899..bd1d0082489 100644 --- a/codegen/tools/test/test_gen_oplist.py +++ b/codegen/tools/test/test_gen_oplist.py @@ -42,7 +42,7 @@ def test_gen_op_list_with_wrong_path( mock_get_operators: NonCallableMock, ) -> None: args = ["--output_path=wrong_path", "--model_file_path=path2"] - with self.assertRaises(AssertionError): + with self.assertRaises(RuntimeError): gen_oplist.main(args) @patch("executorch.codegen.tools.gen_oplist._get_kernel_metadata_for_model") diff --git a/configurations/targets.bzl b/configurations/targets.bzl index dc88c137441..6a5341c2904 100644 --- a/configurations/targets.bzl +++ b/configurations/targets.bzl @@ -20,7 +20,7 @@ def define_common_targets(): runtime.cxx_library( name = "executor_cpu_optimized", exported_deps = [ - "//executorch/backends/xnnpack/threadpool:threadpool", + "//executorch/extension/threadpool:threadpool", ] + get_all_cpu_backend_targets(), visibility = [ "//executorch/test/...", @@ -28,7 +28,7 @@ def define_common_targets(): ], ) - # Add a commong configuration of cpu optimized operators. This adds a bit of confusion + # Add a common configuration of cpu optimized operators. This adds a bit of confusion # with the above executorch_cpu_optimized target. 
Generally it would make sense # to just add optimized operators to that target but because executorch_cpu_optimized # might be used elsewhere, I dont want to include ops in that target and find out @@ -50,3 +50,21 @@ def define_common_targets(): "@EXECUTORCH_CLIENTS", ], ) + + # TODO(T183193812): delete this target after optimized-oss.yaml is gone + executorch_generated_lib( + name = "optimized_native_cpu_ops_oss", + deps = [ + "//executorch/kernels/optimized:optimized_operators", + "//executorch/kernels/optimized:optimized_oplist", + "//executorch/kernels/portable:executorch_aten_ops", + "//executorch/kernels/portable:operators", + ], + functions_yaml_target = "//executorch/kernels/optimized:optimized-oss.yaml", + fallback_yaml_target = "//executorch/kernels/portable:functions.yaml", + define_static_targets = True, + visibility = [ + "//executorch/examples/...", + "@EXECUTORCH_CLIENTS", + ], + ) diff --git a/sdk/CMakeLists.txt b/devtools/CMakeLists.txt similarity index 86% rename from sdk/CMakeLists.txt rename to devtools/CMakeLists.txt index 79903fc315e..776d421a8d3 100644 --- a/sdk/CMakeLists.txt +++ b/devtools/CMakeLists.txt @@ -78,8 +78,8 @@ set_property(TARGET flatccrt PROPERTY POSITION_INDEPENDENT_CODE ON) include(ExternalProject) # The include directory that will contain the generated schema headers. -set(_program_schema__include_dir "${CMAKE_BINARY_DIR}/sdk/include") -set(_bundled_schema__include_dir "${CMAKE_BINARY_DIR}/sdk/bundled_program") +set(_program_schema__include_dir "${CMAKE_BINARY_DIR}/devtools/include") +set(_bundled_schema__include_dir "${CMAKE_BINARY_DIR}/devtools/bundled_program") # TODO(dbort): Only enable this when cross-compiling. It can cause build race # conditions (libflatcc.a errors) when enabled. @@ -92,11 +92,11 @@ if(EXECUTORCH_SEPARATE_FLATCC_HOST_PROJECT) # headers on the host during the build, even if we're cross-compiling the # flatcc runtime to a different architecture. execute_process( - COMMAND ${CMAKE_COMMAND} ${_flatcc_source_dir} - -DFLATCC_TEST=OFF -DFLATCC_REFLECTION=OFF - # See above comment about POSITION_INDEPENDENT_CODE. - -DCMAKE_POSITION_INDEPENDENT_CODE=ON - -B${CMAKE_BINARY_DIR}/_host_build + COMMAND + ${CMAKE_COMMAND} ${_flatcc_source_dir} -DFLATCC_TEST=OFF + -DFLATCC_REFLECTION=OFF + # See above comment about POSITION_INDEPENDENT_CODE. 
+ -DCMAKE_POSITION_INDEPENDENT_CODE=ON -B${CMAKE_BINARY_DIR}/_host_build ) execute_process( COMMAND ${CMAKE_COMMAND} --build ${CMAKE_BINARY_DIR}/_host_build @@ -128,11 +128,11 @@ set(_etdump_schema__outputs) foreach(fbs_file ${_etdump_schema_names}) string(REGEX REPLACE "[.]fbs$" "_reader.h" generated "${fbs_file}") list(APPEND _etdump_schema__outputs - "${_program_schema__include_dir}/executorch/sdk/etdump/${generated}" + "${_program_schema__include_dir}/executorch/devtools/etdump/${generated}" ) string(REGEX REPLACE "[.]fbs$" "_builder.h" generated "${fbs_file}") list(APPEND _etdump_schema__outputs - "${_program_schema__include_dir}/executorch/sdk/etdump/${generated}" + "${_program_schema__include_dir}/executorch/devtools/etdump/${generated}" ) endforeach() @@ -143,7 +143,7 @@ foreach(fbs_file ${_bundled_input_schema_names}) list( APPEND _bundled_program_schema__outputs - "${_bundled_schema__include_dir}/executorch/sdk/bundled_program/schema/${generated}" + "${_bundled_schema__include_dir}/executorch/devtools/bundled_program/schema/${generated}" ) endforeach() @@ -152,9 +152,9 @@ add_library( bundled_program_schema INTERFACE ${_bundled_program_schema__outputs} ) -file(MAKE_DIRECTORY ${_program_schema__include_dir}/executorch/sdk/etdump) +file(MAKE_DIRECTORY ${_program_schema__include_dir}/executorch/devtools/etdump) file(MAKE_DIRECTORY - ${_program_schema__include_dir}/executorch/sdk/bundled_program + ${_program_schema__include_dir}/executorch/devtools/bundled_program ) add_custom_command( @@ -164,7 +164,7 @@ add_custom_command( # tree instead of under the binary directory, and there's no way to change # that behavior. ${_flatcc_source_dir}/bin/flatcc -cwr -o - ${_program_schema__include_dir}/executorch/sdk/etdump + ${_program_schema__include_dir}/executorch/devtools/etdump ${_etdump_schema__srcs} COMMAND rm -f ${_etdump_schema_cleanup_paths} DEPENDS ${_etdump_schema_gen_dep} @@ -186,9 +186,9 @@ add_custom_command( OUTPUT ${_bundled_program_schema__outputs} COMMAND ${FLATC_EXECUTABLE} --cpp --cpp-std c++11 --gen-mutable --scoped-enums -o - "${_bundled_schema__include_dir}/executorch/sdk/bundled_program/schema" + "${_bundled_schema__include_dir}/executorch/devtools/bundled_program/schema" ${_bundled_program_schema__srcs} - WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/sdk + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/devtools DEPENDS ${FLATC_EXECUTABLE} ${_bundled_program_schema__srcs} COMMENT "Generating bundled_program headers" VERBATIM diff --git a/sdk/TARGETS b/devtools/TARGETS similarity index 54% rename from sdk/TARGETS rename to devtools/TARGETS index 56d38a4ad3b..06964b83876 100644 --- a/sdk/TARGETS +++ b/devtools/TARGETS @@ -6,8 +6,8 @@ python_library( name = "lib", srcs = ["__init__.py"], deps = [ - "//executorch/sdk/bundled_program:core", - "//executorch/sdk/etrecord:etrecord", - "//executorch/sdk/inspector:lib", + "//executorch/devtools/bundled_program:core", + "//executorch/devtools/etrecord:etrecord", + "//executorch/devtools/inspector:lib", ], ) diff --git a/sdk/__init__.py b/devtools/__init__.py similarity index 57% rename from sdk/__init__.py rename to devtools/__init__.py index 11134bf276a..821d75901f2 100644 --- a/sdk/__init__.py +++ b/devtools/__init__.py @@ -4,10 +4,10 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
-import executorch.sdk.inspector as inspector -from executorch.sdk.bundled_program.core import BundledProgram -from executorch.sdk.etrecord import ETRecord, generate_etrecord, parse_etrecord -from executorch.sdk.inspector import Inspector +import executorch.devtools.inspector as inspector +from executorch.devtools.bundled_program.core import BundledProgram +from executorch.devtools.etrecord import ETRecord, generate_etrecord, parse_etrecord +from executorch.devtools.inspector import Inspector __all__ = [ "ETRecord", diff --git a/sdk/backend_debug/TARGETS b/devtools/backend_debug/TARGETS similarity index 100% rename from sdk/backend_debug/TARGETS rename to devtools/backend_debug/TARGETS diff --git a/sdk/backend_debug/__init__.py b/devtools/backend_debug/__init__.py similarity index 83% rename from sdk/backend_debug/__init__.py rename to devtools/backend_debug/__init__.py index c1c9726b86b..b457b7d11d5 100644 --- a/sdk/backend_debug/__init__.py +++ b/devtools/backend_debug/__init__.py @@ -4,7 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -from executorch.sdk.backend_debug.delegation_info import ( +from executorch.devtools.backend_debug.delegation_info import ( DelegationBreakdown, get_delegation_info, ) diff --git a/sdk/backend_debug/delegation_info.py b/devtools/backend_debug/delegation_info.py similarity index 100% rename from sdk/backend_debug/delegation_info.py rename to devtools/backend_debug/delegation_info.py diff --git a/sdk/backend_debug/tests/TARGETS b/devtools/backend_debug/tests/TARGETS similarity index 86% rename from sdk/backend_debug/tests/TARGETS rename to devtools/backend_debug/tests/TARGETS index 3c9f6c2e64e..ae234df8ce4 100644 --- a/sdk/backend_debug/tests/TARGETS +++ b/devtools/backend_debug/tests/TARGETS @@ -10,8 +10,8 @@ python_unittest( deps = [ "fbsource//third-party/pypi/pandas:pandas", "//caffe2:torch", + "//executorch/devtools/backend_debug:delegation_info", "//executorch/exir:lib", "//executorch/exir/backend/test:op_partitioner_demo", - "//executorch/sdk/backend_debug:delegation_info", ], ) diff --git a/sdk/backend_debug/tests/test_delegation_info.py b/devtools/backend_debug/tests/test_delegation_info.py similarity index 96% rename from sdk/backend_debug/tests/test_delegation_info.py rename to devtools/backend_debug/tests/test_delegation_info.py index 2d98e9a5950..6ff5169094b 100644 --- a/sdk/backend_debug/tests/test_delegation_info.py +++ b/devtools/backend_debug/tests/test_delegation_info.py @@ -9,9 +9,9 @@ import pandas as pd import torch +from executorch.devtools.backend_debug import DelegationBreakdown, get_delegation_info from executorch.exir import to_edge from executorch.exir.backend.test.op_partitioner_demo import AddMulPartitionerDemo -from executorch.sdk.backend_debug import DelegationBreakdown, get_delegation_info from pandas.testing import assert_frame_equal diff --git a/sdk/bundled_program/TARGETS b/devtools/bundled_program/TARGETS similarity index 88% rename from sdk/bundled_program/TARGETS rename to devtools/bundled_program/TARGETS index c731606217f..27560f70877 100644 --- a/sdk/bundled_program/TARGETS +++ b/devtools/bundled_program/TARGETS @@ -18,10 +18,10 @@ runtime.python_library( ":config", ":version", "//caffe2:torch", + "//executorch/devtools/bundled_program/schema:bundled_program_schema_py", "//executorch/exir:schema", "//executorch/exir:tensor", "//executorch/exir/_serialize:lib", - "//executorch/sdk/bundled_program/schema:bundled_program_schema_py", 
], ) @@ -46,6 +46,6 @@ runtime.python_library( "version.py", ], visibility = [ - "//executorch/sdk/...", + "//executorch/devtools/...", ], ) diff --git a/sdk/bundled_program/bundled_program.cpp b/devtools/bundled_program/bundled_program.cpp similarity index 91% rename from sdk/bundled_program/bundled_program.cpp rename to devtools/bundled_program/bundled_program.cpp index 63affa5c7f7..54f84f6fef1 100644 --- a/sdk/bundled_program/bundled_program.cpp +++ b/devtools/bundled_program/bundled_program.cpp @@ -6,7 +6,7 @@ * LICENSE file in the root directory of this source tree. */ -#include +#include #include #include @@ -16,20 +16,28 @@ #include #endif // USE_ATEN_LIB +#include #include #include #include #include #include -#include -namespace torch { -namespace executor { +using exec_aten::ArrayRef; +using exec_aten::Half; +using exec_aten::ScalarType; +using exec_aten::Tensor; +using ::executorch::runtime::Error; +using ::executorch::runtime::EValue; +using ::executorch::runtime::Method; +using ::executorch::runtime::Result; + +namespace executorch { namespace bundled_program { namespace { -#define kMaxDim 16 +constexpr size_t kMaxDim = 16; #ifdef USE_ATEN_LIB @@ -53,6 +61,7 @@ at::Tensor tensor_like(bundled_program_flatbuffer::Tensor* bundled_tensor) { } #else // !USE_ATEN_LIB +using torch::executor::TensorImpl; // Create a tensorimpl with same content using bundled tensor TensorImpl impl_like(bundled_program_flatbuffer::Tensor* bundled_tensor) { ScalarType scalar_type = @@ -234,9 +243,9 @@ get_method_test_suite( } // namespace // Load testset_idx-th bundled data into the Method -ET_NODISCARD Error LoadBundledInput( +ET_NODISCARD Error load_bundled_input( Method& method, - serialized_bundled_program* bundled_program_ptr, + SerializedBundledProgram* bundled_program_ptr, size_t testset_idx) { ET_CHECK_OR_RETURN_ERROR( bundled_program_flatbuffer::BundledProgramBufferHasIdentifier( @@ -319,19 +328,19 @@ ET_NODISCARD Error LoadBundledInput( ET_CHECK_OR_RETURN_ERROR( status == Error::Ok, NotSupported, - "set_input failed during load bundled inputs with status %" PRIu32, - static_cast(status)); + "set_input failed during load bundled inputs with status 0%" PRIx32, + static_cast(status)); } - internal::event_tracer_set_bundled_input_index( + ::executorch::runtime::internal::event_tracer_set_bundled_input_index( method.get_event_tracer(), testset_idx); return Error::Ok; } -ET_NODISCARD Error VerifyResultWithBundledExpectedOutput( +ET_NODISCARD Error verify_method_outputs( Method& method, - serialized_bundled_program* bundled_program_ptr, + SerializedBundledProgram* bundled_program_ptr, size_t testset_idx, double rtol, double atol) { @@ -390,12 +399,12 @@ ET_NODISCARD Error VerifyResultWithBundledExpectedOutput( return Error::Ok; } -ET_NODISCARD Error GetProgramData( +ET_NODISCARD Error get_program_data( void* file_data, size_t file_data_len, const void** out_program_data, size_t* out_program_data_len) { - if (IsBundledProgram(file_data)) { + if (is_bundled_program(file_data, file_data_len)) { auto program_bundled = bundled_program_flatbuffer::GetBundledProgram(file_data); *out_program_data = program_bundled->program()->data(); @@ -410,11 +419,13 @@ ET_NODISCARD Error GetProgramData( return Error::Ok; } -bool IsBundledProgram(void* file_data) { +bool is_bundled_program(void* file_data, ET_UNUSED size_t file_data_len) { + // Even though the flatbuffer API doesn't accept a length, it's important to + // require one so that we could change the internal representation, or use a + // future API that does 
require a length. return bundled_program_flatbuffer::BundledProgramBufferHasIdentifier( file_data); } } // namespace bundled_program -} // namespace executor -} // namespace torch +} // namespace executorch diff --git a/sdk/bundled_program/bundled_program.h b/devtools/bundled_program/bundled_program.h similarity index 55% rename from sdk/bundled_program/bundled_program.h rename to devtools/bundled_program/bundled_program.h index 8b42923866e..884ca6f21bc 100644 --- a/sdk/bundled_program/bundled_program.h +++ b/devtools/bundled_program/bundled_program.h @@ -11,14 +11,13 @@ #include #include -namespace torch { -namespace executor { +namespace executorch { namespace bundled_program { /** * An opaque pointer to a serialized bundled program. */ -using serialized_bundled_program = const void; +using SerializedBundledProgram = const void; /** * Load testset_idx-th bundled input of method_idx-th Method test in @@ -31,9 +30,9 @@ using serialized_bundled_program = const void; * @returns Return Error::Ok if load successfully, or the error happens during * execution. */ -ET_NODISCARD Error LoadBundledInput( - Method& method, - serialized_bundled_program* bundled_program_ptr, +ET_NODISCARD ::executorch::runtime::Error load_bundled_input( + ::executorch::runtime::Method& method, + SerializedBundledProgram* bundled_program_ptr, size_t testset_idx); /** @@ -49,9 +48,9 @@ ET_NODISCARD Error LoadBundledInput( * @returns Return Error::Ok if two outputs match, or the error happens during * execution. */ -ET_NODISCARD Error VerifyResultWithBundledExpectedOutput( - Method& method, - serialized_bundled_program* bundled_program_ptr, +ET_NODISCARD ::executorch::runtime::Error verify_method_outputs( + ::executorch::runtime::Method& method, + SerializedBundledProgram* bundled_program_ptr, size_t testset_idx, double rtol = 1e-5, double atol = 1e-8); @@ -73,7 +72,7 @@ ET_NODISCARD Error VerifyResultWithBundledExpectedOutput( * in it, and out_program_data/out_program_data_len point to the data. Other * values on failure. */ -ET_NODISCARD Error GetProgramData( +ET_NODISCARD ::executorch::runtime::Error get_program_data( void* file_data, size_t file_data_len, const void** out_program_data, @@ -83,11 +82,61 @@ ET_NODISCARD Error GetProgramData( * Checks whether the given file is a bundled program. * * @param[in] file_data The contents of the given file. + * @param[in] file_data_len The length of file_data, in bytes. * * @returns true if the given file is a bundled program, false otherwise */ -bool IsBundledProgram(void* file_data); +bool is_bundled_program(void* file_data, size_t file_data_len); + +/// DEPRECATED: Use the version with the file_data_len parameter. +ET_DEPRECATED inline bool is_bundled_program(void* file_data) { + // 128 is enough data to contain the identifier in the flatbuffer header. + return is_bundled_program(file_data, 128); +} + +} // namespace bundled_program +} // namespace executorch + +namespace torch { +namespace executor { +namespace bundled_program { +// TODO(T197294990): Remove these deprecated aliases once all users have moved +// to the new `::executorch` namespaces. 
+using serialized_bundled_program = + ::executorch::bundled_program::SerializedBundledProgram; + +ET_NODISCARD inline ::executorch::runtime::Error LoadBundledInput( + ::executorch::runtime::Method& method, + serialized_bundled_program* bundled_program_ptr, + size_t testset_idx) { + return ::executorch::bundled_program::load_bundled_input( + method, bundled_program_ptr, testset_idx); +} + +ET_NODISCARD inline ::executorch::runtime::Error +VerifyResultWithBundledExpectedOutput( + ::executorch::runtime::Method& method, + serialized_bundled_program* bundled_program_ptr, + size_t testset_idx, + double rtol = 1e-5, + double atol = 1e-8) { + return ::executorch::bundled_program::verify_method_outputs( + method, bundled_program_ptr, testset_idx, rtol, atol); +} + +ET_NODISCARD inline ::executorch::runtime::Error GetProgramData( + void* file_data, + size_t file_data_len, + const void** out_program_data, + size_t* out_program_data_len) { + return ::executorch::bundled_program::get_program_data( + file_data, file_data_len, out_program_data, out_program_data_len); +} +inline bool IsBundledProgram(void* file_data) { + // 128 is enough data to contain the identifier in the flatbuffer header. + return ::executorch::bundled_program::is_bundled_program(file_data, 128); +} } // namespace bundled_program } // namespace executor } // namespace torch diff --git a/sdk/bundled_program/config.py b/devtools/bundled_program/config.py similarity index 88% rename from sdk/bundled_program/config.py rename to devtools/bundled_program/config.py index 3bfbe7bc69c..97563177603 100644 --- a/sdk/bundled_program/config.py +++ b/devtools/bundled_program/config.py @@ -39,7 +39,7 @@ """ All supported types for input/expected output of MethodTestCase. -Namedtuple is also supported and listed implicity since it is a subclass of tuple. +Namedtuple is also supported and listed implicitly since it is a subclass of tuple. """ # pyre-ignore @@ -59,23 +59,23 @@ def __init__( """Single test case for verifying specific method Args: - input: All inputs required by eager_model with specific inference method for one-time execution. + inputs: All inputs required by eager_model with specific inference method for one-time execution. It is worth mentioning that, although both bundled program and ET runtime apis support setting input other than `torch.tensor` type, only the input in `torch.tensor` type will be actually updated in the method, and the rest of the inputs will just do a sanity check if they match the default value in method. - expected_output: Expected output of given input for verification. It can be None if user only wants to use the test case for profiling. + expected_outputs: Expected output of given input for verification. It can be None if user only wants to use the test case for profiling. Returns: self """ # TODO(gasoonjia): Update type check logic. - # pyre-ignore [6]: Misalign data type for between MethodTestCase attribute and sannity check. + # pyre-ignore [6]: Misalign data type for between MethodTestCase attribute and sanity check. self.inputs: List[ConfigValue] = self._flatten_and_sanity_check(inputs) self.expected_outputs: List[ConfigValue] = [] if expected_outputs is not None: - # pyre-ignore [6]: Misalign data type for between MethodTestCase attribute and sannity check. + # pyre-ignore [6]: Misalign data type for between MethodTestCase attribute and sanity check. 
self.expected_outputs = self._flatten_and_sanity_check(expected_outputs) def _flatten_and_sanity_check( diff --git a/sdk/bundled_program/core.py b/devtools/bundled_program/core.py similarity index 98% rename from sdk/bundled_program/core.py rename to devtools/bundled_program/core.py index 4fede5e5952..c775fb1510d 100644 --- a/sdk/bundled_program/core.py +++ b/devtools/bundled_program/core.py @@ -8,19 +8,19 @@ import typing from typing import Dict, List, Optional, Sequence, Type, Union -import executorch.exir.schema as core_schema +import executorch.devtools.bundled_program.schema as bp_schema -import executorch.sdk.bundled_program.schema as bp_schema +import executorch.exir.schema as core_schema import torch import torch.fx +from executorch.devtools.bundled_program.config import ConfigValue, MethodTestSuite + +from executorch.devtools.bundled_program.version import BUNDLED_PROGRAM_SCHEMA_VERSION from executorch.exir import ExecutorchProgram, ExecutorchProgramManager from executorch.exir._serialize import _serialize_pte_binary from executorch.exir.tensor import get_scalar_type, scalar_type_enum, TensorSpec -from executorch.sdk.bundled_program.config import ConfigValue, MethodTestSuite - -from executorch.sdk.bundled_program.version import BUNDLED_PROGRAM_SCHEMA_VERSION # pyre-ignore supported_program_type_table: Dict[Type[core_schema.KernelTypes], ConfigValue] = { @@ -230,7 +230,7 @@ def _assert_valid_bundle( Other checks not related to correspondence are done in config.py Args: - program: The program to be bundled. + executorch_program: The program to be bundled. method_test_suites: The testcases for specific methods to be bundled. """ diff --git a/sdk/bundled_program/schema/README.md b/devtools/bundled_program/schema/README.md similarity index 100% rename from sdk/bundled_program/schema/README.md rename to devtools/bundled_program/schema/README.md diff --git a/sdk/bundled_program/schema/TARGETS b/devtools/bundled_program/schema/TARGETS similarity index 84% rename from sdk/bundled_program/schema/TARGETS rename to devtools/bundled_program/schema/TARGETS index e9bd642069d..51c004cbec0 100644 --- a/sdk/bundled_program/schema/TARGETS +++ b/devtools/bundled_program/schema/TARGETS @@ -15,8 +15,8 @@ runtime.python_library( "bundled_program_schema.py", ], visibility = [ - "//executorch/sdk/bundled_program/...", - "//executorch/sdk/etrecord/...", + "//executorch/devtools/bundled_program/...", + "//executorch/devtools/etrecord/...", ], deps = [ "//executorch/exir:scalar_type", diff --git a/sdk/bundled_program/schema/__init__.py b/devtools/bundled_program/schema/__init__.py similarity index 100% rename from sdk/bundled_program/schema/__init__.py rename to devtools/bundled_program/schema/__init__.py diff --git a/sdk/bundled_program/schema/bundled_program_schema.fbs b/devtools/bundled_program/schema/bundled_program_schema.fbs similarity index 100% rename from sdk/bundled_program/schema/bundled_program_schema.fbs rename to devtools/bundled_program/schema/bundled_program_schema.fbs diff --git a/sdk/bundled_program/schema/bundled_program_schema.py b/devtools/bundled_program/schema/bundled_program_schema.py similarity index 100% rename from sdk/bundled_program/schema/bundled_program_schema.py rename to devtools/bundled_program/schema/bundled_program_schema.py diff --git a/sdk/bundled_program/schema/scalar_type.fbs b/devtools/bundled_program/schema/scalar_type.fbs similarity index 100% rename from sdk/bundled_program/schema/scalar_type.fbs rename to devtools/bundled_program/schema/scalar_type.fbs diff --git 
a/sdk/bundled_program/schema/targets.bzl b/devtools/bundled_program/schema/targets.bzl similarity index 93% rename from sdk/bundled_program/schema/targets.bzl rename to devtools/bundled_program/schema/targets.bzl index a25d792c5a3..532a01e039e 100644 --- a/sdk/bundled_program/schema/targets.bzl +++ b/devtools/bundled_program/schema/targets.bzl @@ -49,14 +49,14 @@ def define_common_targets(): runtime.export_file( name = INPUT_BUNDLED, visibility = [ - "//executorch/sdk/bundled_program/serialize/...", + "//executorch/devtools/bundled_program/serialize/...", ], ) runtime.export_file( name = INPUT_SCALAR_TYPE, visibility = [ - "//executorch/sdk/bundled_program/serialize/...", + "//executorch/devtools/bundled_program/serialize/...", ], ) @@ -72,7 +72,7 @@ def define_common_targets(): name = BUNDLED_LIBRARY_NAME, srcs = [], visibility = [ - "//executorch/sdk/bundled_program/...", + "//executorch/devtools/bundled_program/...", "//executorch/extension/pybindings/...", ], exported_headers = { diff --git a/sdk/bundled_program/schema/test/TARGETS b/devtools/bundled_program/schema/test/TARGETS similarity index 100% rename from sdk/bundled_program/schema/test/TARGETS rename to devtools/bundled_program/schema/test/TARGETS diff --git a/sdk/bundled_program/schema/test/test_schema.py b/devtools/bundled_program/schema/test/test_schema.py similarity index 79% rename from sdk/bundled_program/schema/test/test_schema.py rename to devtools/bundled_program/schema/test/test_schema.py index ab3d2760d29..c2a19adef79 100644 --- a/sdk/bundled_program/schema/test/test_schema.py +++ b/devtools/bundled_program/schema/test/test_schema.py @@ -20,8 +20,8 @@ def test_schema_sync(self) -> None: self.assertTrue( filecmp.cmp( - prefix + "sdk/bundled_program/schema/scalar_type.fbs", + prefix + "devtools/bundled_program/schema/scalar_type.fbs", prefix + "schema/scalar_type.fbs", ), - 'Please run "hg cp fbcode//executorch/schema/scalar_type.fbs fbcode//executorch/sdk/bundled_program/schema/scalar_type.fbs" to sync schema changes.', + 'Please run "hg cp fbcode//executorch/schema/scalar_type.fbs fbcode//executorch/devtools/bundled_program/schema/scalar_type.fbs" to sync schema changes.', ) diff --git a/sdk/bundled_program/serialize/TARGETS b/devtools/bundled_program/serialize/TARGETS similarity index 76% rename from sdk/bundled_program/serialize/TARGETS rename to devtools/bundled_program/serialize/TARGETS index 20abccd7fda..11c58399778 100644 --- a/sdk/bundled_program/serialize/TARGETS +++ b/devtools/bundled_program/serialize/TARGETS @@ -10,8 +10,8 @@ runtime.python_library( "__init__.py", ], resources = { - "//executorch/sdk/bundled_program/schema:bundled_program_schema.fbs": "bundled_program_schema.fbs", - "//executorch/sdk/bundled_program/schema:scalar_type.fbs": "scalar_type.fbs", + "//executorch/devtools/bundled_program/schema:bundled_program_schema.fbs": "bundled_program_schema.fbs", + "//executorch/devtools/bundled_program/schema:scalar_type.fbs": "scalar_type.fbs", }, # Currently serialization API should only be used in some dedicated targets, # to avoid ODR violation when linking with another Flatbuffers library. 
@@ -20,18 +20,18 @@ runtime.python_library( "//executorch/bacends/...", "//executorch/backends/xnnpack/test/...", "//executorch/codegen/...", + "//executorch/devtools/bundled_program/tests/...", "//executorch/examples/async_exec:emit_program_lib", "//executorch/exir:lib", "//executorch/extension/pybindings/test:test", "//executorch/extension/pybindings/test:test-library", "//executorch/profiler/...", - "//executorch/sdk/bundled_program/tests/...", "//executorch/test/...", "@EXECUTORCH_CLIENTS", ], deps = [ "fbsource//third-party/pypi/setuptools:setuptools", + "//executorch/devtools/bundled_program/schema:bundled_program_schema_py", "//executorch/exir/_serialize:lib", - "//executorch/sdk/bundled_program/schema:bundled_program_schema_py", ], ) diff --git a/sdk/bundled_program/serialize/__init__.py b/devtools/bundled_program/serialize/__init__.py similarity index 97% rename from sdk/bundled_program/serialize/__init__.py rename to devtools/bundled_program/serialize/__init__.py index e0c75574c93..075436e9c11 100644 --- a/sdk/bundled_program/serialize/__init__.py +++ b/devtools/bundled_program/serialize/__init__.py @@ -12,14 +12,14 @@ import os import tempfile -import executorch.sdk.bundled_program.schema as bp_schema +import executorch.devtools.bundled_program.schema as bp_schema # @manual=fbsource//third-party/pypi/setuptools:setuptools import pkg_resources +from executorch.devtools.bundled_program.core import BundledProgram from executorch.exir._serialize._dataclass import _DataclassEncoder, _json_to_dataclass from executorch.exir._serialize._flatbuffer import _flatc_compile, _flatc_decompile -from executorch.sdk.bundled_program.core import BundledProgram # The prefix of schema files used for bundled program BUNDLED_PROGRAM_SCHEMA_NAME = "bundled_program_schema" diff --git a/sdk/bundled_program/serialize/test/TARGETS b/devtools/bundled_program/serialize/test/TARGETS similarity index 51% rename from sdk/bundled_program/serialize/test/TARGETS rename to devtools/bundled_program/serialize/test/TARGETS index 85f55c02f8d..dd92f63f2dd 100644 --- a/sdk/bundled_program/serialize/test/TARGETS +++ b/devtools/bundled_program/serialize/test/TARGETS @@ -10,9 +10,8 @@ python_unittest( "test_serialize.py", ], deps = [ - "//executorch/exir:print_program", - "//executorch/sdk/bundled_program:core", - "//executorch/sdk/bundled_program/serialize:lib", - "//executorch/sdk/bundled_program/util:test_util", + "//executorch/devtools/bundled_program:core", + "//executorch/devtools/bundled_program/serialize:lib", + "//executorch/devtools/bundled_program/util:test_util", ], ) diff --git a/sdk/bundled_program/serialize/test/test_serialize.py b/devtools/bundled_program/serialize/test/test_serialize.py similarity index 82% rename from sdk/bundled_program/serialize/test/test_serialize.py rename to devtools/bundled_program/serialize/test/test_serialize.py index 1db6871fc06..48a914d1447 100644 --- a/sdk/bundled_program/serialize/test/test_serialize.py +++ b/devtools/bundled_program/serialize/test/test_serialize.py @@ -8,13 +8,15 @@ import unittest -from executorch.sdk.bundled_program.core import BundledProgram +from executorch.devtools.bundled_program.core import BundledProgram -from executorch.sdk.bundled_program.serialize import ( +from executorch.devtools.bundled_program.serialize import ( deserialize_from_flatbuffer_to_bundled_program, serialize_from_bundled_program_to_flatbuffer, ) -from executorch.sdk.bundled_program.util.test_util import get_common_executorch_program +from 
executorch.devtools.bundled_program.util.test_util import ( + get_common_executorch_program, +) class TestSerialize(unittest.TestCase): diff --git a/sdk/bundled_program/targets.bzl b/devtools/bundled_program/targets.bzl similarity index 91% rename from sdk/bundled_program/targets.bzl rename to devtools/bundled_program/targets.bzl index a3268dff2c5..7035b3b31f6 100644 --- a/sdk/bundled_program/targets.bzl +++ b/devtools/bundled_program/targets.bzl @@ -19,7 +19,7 @@ def define_common_targets(): ], deps = [ "//executorch/runtime/core/exec_aten/util:dim_order_util" + aten_suffix, - "//executorch/sdk/bundled_program/schema:bundled_program_schema_fbs", + "//executorch/devtools/bundled_program/schema:bundled_program_schema_fbs", ], exported_deps = [ "//executorch/runtime/core:memory_allocator", diff --git a/sdk/bundled_program/test/TARGETS b/devtools/bundled_program/test/TARGETS similarity index 68% rename from sdk/bundled_program/test/TARGETS rename to devtools/bundled_program/test/TARGETS index caf69be60e1..652c74b8f43 100644 --- a/sdk/bundled_program/test/TARGETS +++ b/devtools/bundled_program/test/TARGETS @@ -1,4 +1,5 @@ # @noautodeps + load("@fbcode_macros//build_defs:python_unittest.bzl", "python_unittest") oncall("executorch") @@ -10,11 +11,11 @@ python_unittest( ], deps = [ "//caffe2:torch", + "//executorch/devtools/bundled_program:config", + "//executorch/devtools/bundled_program:core", + "//executorch/devtools/bundled_program/schema:bundled_program_schema_py", + "//executorch/devtools/bundled_program/util:test_util", "//executorch/exir/_serialize:lib", - "//executorch/sdk/bundled_program:config", - "//executorch/sdk/bundled_program:core", - "//executorch/sdk/bundled_program/schema:bundled_program_schema_py", - "//executorch/sdk/bundled_program/util:test_util", ], ) @@ -25,9 +26,9 @@ python_unittest( ], deps = [ "//caffe2:torch", + "//executorch/devtools/bundled_program:config", + "//executorch/devtools/bundled_program/util:test_util", "//executorch/extension/pytree:pylib", - "//executorch/sdk/bundled_program:config", - "//executorch/sdk/bundled_program/util:test_util", ], ) @@ -38,6 +39,10 @@ python_unittest( ], deps = [ "//caffe2:torch", + "//executorch/devtools/bundled_program:config", + "//executorch/devtools/bundled_program:core", + "//executorch/devtools/bundled_program/serialize:lib", + "//executorch/devtools/bundled_program/util:test_util", "//executorch/exir:dynamic_shape", "//executorch/exir:lib", "//executorch/exir:memory", @@ -54,9 +59,5 @@ python_unittest( "//executorch/extension/pybindings:portable_lib", "//executorch/extension/pytree:pybindings", "//executorch/kernels/portable:custom_ops_generated_lib", - "//executorch/sdk/bundled_program:config", - "//executorch/sdk/bundled_program:core", - "//executorch/sdk/bundled_program/serialize:lib", - "//executorch/sdk/bundled_program/util:test_util", ], ) diff --git a/sdk/bundled_program/test/test_bundle_data.py b/devtools/bundled_program/test/test_bundle_data.py similarity index 93% rename from sdk/bundled_program/test/test_bundle_data.py rename to devtools/bundled_program/test/test_bundle_data.py index a8d9485c5ff..565539cbf15 100644 --- a/sdk/bundled_program/test/test_bundle_data.py +++ b/devtools/bundled_program/test/test_bundle_data.py @@ -9,13 +9,15 @@ import unittest from typing import List -import executorch.sdk.bundled_program.schema as bp_schema +import executorch.devtools.bundled_program.schema as bp_schema import torch +from executorch.devtools.bundled_program.config import ConfigValue +from 
executorch.devtools.bundled_program.core import BundledProgram +from executorch.devtools.bundled_program.util.test_util import ( + get_common_executorch_program, +) from executorch.exir._serialize import _serialize_pte_binary -from executorch.sdk.bundled_program.config import ConfigValue -from executorch.sdk.bundled_program.core import BundledProgram -from executorch.sdk.bundled_program.util.test_util import get_common_executorch_program class TestBundle(unittest.TestCase): diff --git a/sdk/bundled_program/test/test_config.py b/devtools/bundled_program/test/test_config.py similarity index 97% rename from sdk/bundled_program/test/test_config.py rename to devtools/bundled_program/test/test_config.py index 3183ad907fe..21f3d480423 100644 --- a/sdk/bundled_program/test/test_config.py +++ b/devtools/bundled_program/test/test_config.py @@ -10,14 +10,14 @@ from typing import get_args, List, Union import torch -from executorch.extension.pytree import tree_flatten -from executorch.sdk.bundled_program.config import DataContainer +from executorch.devtools.bundled_program.config import DataContainer -from executorch.sdk.bundled_program.util.test_util import ( +from executorch.devtools.bundled_program.util.test_util import ( get_random_test_suites, get_random_test_suites_with_eager_model, SampleModel, ) +from executorch.extension.pytree import tree_flatten class TestConfig(unittest.TestCase): diff --git a/sdk/bundled_program/test/test_end2end.py b/devtools/bundled_program/test/test_end2end.py similarity index 88% rename from sdk/bundled_program/test/test_end2end.py rename to devtools/bundled_program/test/test_end2end.py index 99d58ee15ca..7cee073be0e 100644 --- a/sdk/bundled_program/test/test_end2end.py +++ b/devtools/bundled_program/test/test_end2end.py @@ -21,12 +21,12 @@ import torch -from executorch.sdk.bundled_program.core import BundledProgram -from executorch.sdk.bundled_program.serialize import ( +from executorch.devtools.bundled_program.core import BundledProgram +from executorch.devtools.bundled_program.serialize import ( serialize_from_bundled_program_to_flatbuffer, ) -from executorch.sdk.bundled_program.util.test_util import ( +from executorch.devtools.bundled_program.util.test_util import ( get_common_executorch_program, SampleModel, ) @@ -45,7 +45,7 @@ pass try: - from executorch.extension.pybindings.aten_lib import ( + from executorch.extension.pybindings.aten_lib import ( # @manual=//executorch/extension/pybindings:aten_lib _load_bundled_program_from_buffer, _load_for_executorch_from_buffer, _load_for_executorch_from_bundled_program, diff --git a/sdk/bundled_program/util/TARGETS b/devtools/bundled_program/util/TARGETS similarity index 68% rename from sdk/bundled_program/util/TARGETS rename to devtools/bundled_program/util/TARGETS index 17d19dfb29a..7d019ce30fb 100644 --- a/sdk/bundled_program/util/TARGETS +++ b/devtools/bundled_program/util/TARGETS @@ -7,10 +7,10 @@ python_library( srcs = [ "test_util.py", ], - visibility = ["//executorch/sdk/bundled_program/..."], + visibility = ["//executorch/devtools/bundled_program/..."], deps = [ "//caffe2:torch", + "//executorch/devtools/bundled_program:config", "//executorch/exir:lib", - "//executorch/sdk/bundled_program:config", ], ) diff --git a/sdk/bundled_program/util/test_util.py b/devtools/bundled_program/util/test_util.py similarity index 99% rename from sdk/bundled_program/util/test_util.py rename to devtools/bundled_program/util/test_util.py index bfea8158acb..505186f3a08 100644 --- a/sdk/bundled_program/util/test_util.py +++ 
b/devtools/bundled_program/util/test_util.py @@ -10,14 +10,14 @@ from typing import List, Tuple import torch - -from executorch.exir import ExecutorchProgramManager, to_edge -from executorch.sdk.bundled_program.config import ( +from executorch.devtools.bundled_program.config import ( MethodInputType, MethodOutputType, MethodTestCase, MethodTestSuite, ) + +from executorch.exir import ExecutorchProgramManager, to_edge from torch.export import export from torch.export.unflatten import _assign_attr, _AttrKind diff --git a/sdk/bundled_program/version.py b/devtools/bundled_program/version.py similarity index 100% rename from sdk/bundled_program/version.py rename to devtools/bundled_program/version.py diff --git a/sdk/debug_format/TARGETS b/devtools/debug_format/TARGETS similarity index 100% rename from sdk/debug_format/TARGETS rename to devtools/debug_format/TARGETS diff --git a/sdk/debug_format/base_schema.py b/devtools/debug_format/base_schema.py similarity index 94% rename from sdk/debug_format/base_schema.py rename to devtools/debug_format/base_schema.py index b987c288744..9b6247051ec 100644 --- a/sdk/debug_format/base_schema.py +++ b/devtools/debug_format/base_schema.py @@ -4,8 +4,10 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + """ -Base Intermediate Representation for Productivity SDK consumers +Base Intermediate Representation for Developer Tools consumers (e.g. TensorBoard, Terminal Debugger) """ diff --git a/sdk/debug_format/et_schema.py b/devtools/debug_format/et_schema.py similarity index 99% rename from sdk/debug_format/et_schema.py rename to devtools/debug_format/et_schema.py index 9a6af4edba9..bb15d70abc4 100644 --- a/sdk/debug_format/et_schema.py +++ b/devtools/debug_format/et_schema.py @@ -4,8 +4,10 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+# pyre-unsafe + """ -Intermediate Representation of ExecuTorch Concepts in Productivity SDK +Intermediate Representation of ExecuTorch Concepts in Developer Tools """ from __future__ import annotations @@ -21,7 +23,7 @@ import torch from executorch import exir -from executorch.sdk.debug_format.base_schema import ( +from executorch.devtools.debug_format.base_schema import ( Node, OperatorGraph, OperatorNode, diff --git a/sdk/etdump/TARGETS b/devtools/etdump/TARGETS similarity index 81% rename from sdk/etdump/TARGETS rename to devtools/etdump/TARGETS index 22d07478cbe..7dcc4c1e84b 100644 --- a/sdk/etdump/TARGETS +++ b/devtools/etdump/TARGETS @@ -11,7 +11,7 @@ runtime.python_library( "schema_flatcc.py", ], visibility = [ - "//executorch/sdk/...", + "//executorch/devtools/...", ], deps = [ "//executorch/exir:scalar_type", @@ -24,11 +24,11 @@ runtime.python_library( "serialize.py", ], resources = { + "//executorch/devtools/etdump:etdump_schema_flatcc.fbs": "etdump_schema_flatcc.fbs", "//executorch/schema:scalar_type.fbs": "scalar_type.fbs", - "//executorch/sdk/etdump:etdump_schema_flatcc.fbs": "etdump_schema_flatcc.fbs", }, visibility = [ - "//executorch/sdk/...", + "//executorch/devtools/...", ], deps = [ "fbsource//third-party/pypi/setuptools:setuptools", diff --git a/sdk/etdump/emitter.cpp b/devtools/etdump/emitter.cpp similarity index 66% rename from sdk/etdump/emitter.cpp rename to devtools/etdump/emitter.cpp index 1b3cba9d196..653c75cb084 100644 --- a/sdk/etdump/emitter.cpp +++ b/devtools/etdump/emitter.cpp @@ -6,16 +6,25 @@ * LICENSE file in the root directory of this source tree. */ -#include +#include + #include +#include + +#include +#include + +#include -#include "executorch/runtime/platform/assert.h" -#include "executorch/sdk/etdump/emitter.h" +using executorch::etdump::internal::ETDumpStaticAllocator; -namespace torch { -namespace executor { +namespace executorch { +namespace etdump { +namespace internal { -static int _allocator_fn( +namespace { + +int allocator_fn( void* alloc_context, flatcc_iovec_t* b, size_t request, @@ -24,8 +33,8 @@ static int _allocator_fn( void* p; size_t n; - struct etdump_static_allocator* state = - (struct etdump_static_allocator*)alloc_context; + ETDumpStaticAllocator* state = + reinterpret_cast(alloc_context); // This allocator doesn't support freeing memory. if (request == 0) { @@ -113,14 +122,14 @@ static int _allocator_fn( // This emitter implementation emits to a fixed size buffer and will fail if it // runs out of room on either end. -static int _emitter_fn( +int emitter_fn( void* emit_context, const flatcc_iovec_t* iov, int iov_count, flatbuffers_soffset_t offset, size_t len) { - struct etdump_static_allocator* E = - (struct etdump_static_allocator*)emit_context; + ETDumpStaticAllocator* E = + reinterpret_cast(emit_context); uint8_t* p; if (offset < 0) { @@ -144,40 +153,15 @@ static int _emitter_fn( return 0; } -/******************************************************************************* - * Public Functions - ******************************************************************************/ - -int etdump_static_allocator_builder_init( - flatcc_builder_t* builder, - struct etdump_static_allocator* alloc) { - ET_CHECK(builder != nullptr); - ET_CHECK(alloc != nullptr); - - // Ensure data size is multiple of 32 (minimum allocation size). - ET_CHECK((alloc->data_size & 0x1F) == 0); - // Ensure out_size is divisable by 2 to ensure front/back sizes are equal for - // emitter.. 
- ET_CHECK((alloc->out_size & 0x1) == 0); - - return flatcc_builder_custom_init( - builder, _emitter_fn, alloc, _allocator_fn, alloc); -} - -void etdump_static_allocator_reset(struct etdump_static_allocator* alloc) { - ET_CHECK(alloc != nullptr); - alloc->allocated = 0; - size_t n = alloc->out_size / 2; - alloc->front_cursor = &alloc->data[alloc->data_size + n]; - alloc->front_left = n; -} +} // namespace -int et_flatcc_custom_init( +int etdump_flatcc_custom_init( flatcc_builder_t* builder, - struct etdump_static_allocator* alloc) { + struct ETDumpStaticAllocator* alloc) { return flatcc_builder_custom_init( - builder, _emitter_fn, alloc, _allocator_fn, alloc); + builder, emitter_fn, alloc, allocator_fn, alloc); } -} // namespace executor -} // namespace torch +} // namespace internal +} // namespace etdump +} // namespace executorch diff --git a/devtools/etdump/emitter.h b/devtools/etdump/emitter.h new file mode 100644 index 00000000000..09c1b56aa56 --- /dev/null +++ b/devtools/etdump/emitter.h @@ -0,0 +1,28 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +#include + +typedef struct flatcc_builder flatcc_builder_t; + +namespace executorch { +namespace etdump { +namespace internal { + +int etdump_flatcc_custom_init( + flatcc_builder_t* builder, + internal::ETDumpStaticAllocator* alloc); + +} // namespace internal +} // namespace etdump +} // namespace executorch diff --git a/sdk/etdump/etdump_flatcc.cpp b/devtools/etdump/etdump_flatcc.cpp similarity index 53% rename from sdk/etdump/etdump_flatcc.cpp rename to devtools/etdump/etdump_flatcc.cpp index dab1443b55f..4c05bb5acee 100644 --- a/sdk/etdump/etdump_flatcc.cpp +++ b/devtools/etdump/etdump_flatcc.cpp @@ -6,19 +6,33 @@ * LICENSE file in the root directory of this source tree. 
*/ -#include "executorch/sdk/etdump/etdump_flatcc.h" -#include -#include +#include + +#include + +#include +#include +#include +#include +#include +#include + #include -#include -#include -#include "executorch/runtime/core/exec_aten/exec_aten.h" -#include "executorch/runtime/core/exec_aten/util/scalar_type_util.h" -#include "executorch/runtime/platform/assert.h" -#include "executorch/sdk/etdump/emitter.h" -namespace torch { -namespace executor { +using ::exec_aten::Tensor; +using ::executorch::runtime::AllocatorID; +using ::executorch::runtime::ArrayRef; +using ::executorch::runtime::ChainID; +using ::executorch::runtime::DebugHandle; +using ::executorch::runtime::DelegateDebugIdType; +using ::executorch::runtime::EValue; +using ::executorch::runtime::EventTracerEntry; +using ::executorch::runtime::LoggedEValueType; +using ::executorch::runtime::Span; +using ::executorch::runtime::Tag; + +namespace executorch { +namespace etdump { namespace { @@ -50,30 +64,30 @@ executorch_flatbuffer_ScalarType_enum_t get_flatbuffer_scalar_type( } etdump_Tensor_ref_t add_tensor_entry( - flatcc_builder_t* builder, + flatcc_builder_t* builder_, const exec_aten::Tensor& tensor, long offset) { - etdump_Tensor_start(builder); + etdump_Tensor_start(builder_); etdump_Tensor_scalar_type_add( - builder, get_flatbuffer_scalar_type(tensor.scalar_type())); - etdump_Tensor_sizes_start(builder); + builder_, get_flatbuffer_scalar_type(tensor.scalar_type())); + etdump_Tensor_sizes_start(builder_); for (auto dim : tensor.sizes()) { int64_t cast_dim = static_cast(dim); - etdump_Tensor_sizes_push(builder, &cast_dim); + etdump_Tensor_sizes_push(builder_, &cast_dim); } - etdump_Tensor_sizes_end(builder); + etdump_Tensor_sizes_end(builder_); - etdump_Tensor_strides_start(builder); + etdump_Tensor_strides_start(builder_); for (auto dim : tensor.strides()) { int64_t cast_dim = static_cast(dim); - etdump_Tensor_strides_push(builder, &cast_dim); + etdump_Tensor_strides_push(builder_, &cast_dim); } - etdump_Tensor_strides_end(builder); - etdump_Tensor_offset_add(builder, offset); + etdump_Tensor_strides_end(builder_); + etdump_Tensor_offset_add(builder_, offset); - return etdump_Tensor_end(builder); + return etdump_Tensor_end(builder_); } static uint8_t* alignPointer(void* ptr, size_t alignment) { @@ -88,71 +102,71 @@ static uint8_t* alignPointer(void* ptr, size_t alignment) { } // namespace -constexpr size_t max_alloc_buf_size = 128 * 1024; - // Constructor implementation ETDumpGen::ETDumpGen(Span buffer) { - // Initialize the flatcc builder using the buffer and buffer size. + constexpr size_t max_alloc_buf_size = 128 * 1024; + + // Initialize the flatcc builder_ using the buffer and buffer size. if (buffer.data() != nullptr) { - builder = (struct flatcc_builder*)alignPointer(buffer.data(), 64); + builder_ = (struct flatcc_builder*)alignPointer(buffer.data(), 64); uintptr_t buffer_with_builder = - (uintptr_t)alignPointer(builder + sizeof(struct flatcc_builder), 64); + (uintptr_t)alignPointer(builder_ + sizeof(struct flatcc_builder), 64); size_t buffer_size = buffer.size() - (size_t)(buffer_with_builder - (uintptr_t)buffer.data()); - alloc.set_buffer( + alloc_.set_buffer( (uint8_t*)buffer_with_builder, buffer_size, (size_t)((buffer_size / 4 > max_alloc_buf_size) ? 
max_alloc_buf_size : buffer_size / 4)); - et_flatcc_custom_init(builder, &alloc); + internal::etdump_flatcc_custom_init(builder_, &alloc_); } else { - builder = (struct flatcc_builder*)malloc(sizeof(struct flatcc_builder)); + builder_ = (struct flatcc_builder*)malloc(sizeof(struct flatcc_builder)); ET_CHECK_MSG( - builder != nullptr, "Failed to allocate memory for flatcc builder."); - flatcc_builder_init(builder); + builder_ != nullptr, "Failed to allocate memory for flatcc builder_."); + flatcc_builder_init(builder_); } reset(); } ETDumpGen::~ETDumpGen() { - flatcc_builder_clear(builder); + flatcc_builder_clear(builder_); if (!is_static_etdump()) { - free(builder); + free(builder_); } } void ETDumpGen::reset() { - etdump_gen_state = ETDumpGen_Init; - num_blocks = 0; - flatcc_builder_reset(builder); - flatbuffers_buffer_start(builder, etdump_ETDump_file_identifier); - etdump_ETDump_start_as_root_with_size(builder); - etdump_ETDump_version_add(builder, ETDUMP_VERSION); - etdump_ETDump_run_data_start(builder); - etdump_ETDump_run_data_push_start(builder); + state_ = State::Init; + num_blocks_ = 0; + flatcc_builder_reset(builder_); + flatbuffers_buffer_start(builder_, etdump_ETDump_file_identifier); + etdump_ETDump_start_as_root_with_size(builder_); + etdump_ETDump_version_add(builder_, ETDUMP_VERSION); + etdump_ETDump_run_data_start(builder_); + etdump_ETDump_run_data_push_start(builder_); } void ETDumpGen::create_event_block(const char* name) { - if (etdump_gen_state == ETDumpGen_Adding_Events) { - etdump_RunData_events_end(builder); - } else if (etdump_gen_state == ETDumpGen_Done) { + if (state_ == State::AddingEvents) { + etdump_RunData_events_end(builder_); + } else if (state_ == State::Done) { reset(); } - if (num_blocks > 0) { - etdump_ETDump_run_data_push_end(builder); - etdump_ETDump_run_data_push_start(builder); + if (num_blocks_ > 0) { + etdump_ETDump_run_data_push_end(builder_); + etdump_ETDump_run_data_push_start(builder_); } - ++num_blocks; - etdump_RunData_name_create_strn(builder, name, strlen(name)); - if (bundled_input_index != -1) { - etdump_RunData_bundled_input_index_add(builder, bundled_input_index); + ++num_blocks_; + etdump_RunData_name_create_strn(builder_, name, strlen(name)); + if (bundled_input_index_ != -1) { + etdump_RunData_bundled_input_index_add(builder_, bundled_input_index_); } - etdump_gen_state = ETDumpGen_Block_Created; + state_ = State::BlockCreated; } int64_t ETDumpGen::create_string_entry(const char* name) { - return flatbuffers_string_create_str(builder, name); + return flatbuffers_string_create_str(builder_, name); } // ETDumpGen has the following possible states, ETDumpGen_Init, @@ -169,16 +183,15 @@ int64_t ETDumpGen::create_string_entry(const char* name) { // type again. In this case once we close the allocators table and start pushing // to the events table we cannot push to the allocators table again. void ETDumpGen::check_ready_to_add_events() { - if (etdump_gen_state != ETDumpGen_Adding_Events) { + if (state_ != State::AddingEvents) { ET_CHECK_MSG( - (etdump_gen_state == ETDumpGen_Adding_Allocators || - etdump_gen_state == ETDumpGen_Block_Created), + (state_ == State::AddingAllocators || state_ == State::BlockCreated), "ETDumpGen in an invalid state. 
Cannot add new events now."); - if (etdump_gen_state == ETDumpGen_Adding_Allocators) { - etdump_RunData_allocators_end(builder); + if (state_ == State::AddingAllocators) { + etdump_RunData_allocators_end(builder_); } - etdump_RunData_events_start(builder); - etdump_gen_state = ETDumpGen_Adding_Events; + etdump_RunData_events_start(builder_); + state_ = State::AddingEvents; } } @@ -231,29 +244,29 @@ void ETDumpGen::end_profiling_delegate( check_ready_to_add_events(); // Start building the ProfileEvent entry. - etdump_ProfileEvent_start(builder); - etdump_ProfileEvent_start_time_add(builder, event_tracer_entry.start_time); - etdump_ProfileEvent_end_time_add(builder, end_time); - etdump_ProfileEvent_chain_index_add(builder, chain_id_); - etdump_ProfileEvent_instruction_id_add(builder, debug_handle_); + etdump_ProfileEvent_start(builder_); + etdump_ProfileEvent_start_time_add(builder_, event_tracer_entry.start_time); + etdump_ProfileEvent_end_time_add(builder_, end_time); + etdump_ProfileEvent_chain_index_add(builder_, chain_id_); + etdump_ProfileEvent_instruction_id_add(builder_, debug_handle_); // Delegate debug identifier can either be of a string type or an integer // type. If it's a string type then it's a value of type // flatbuffers_string_ref_t type, whereas if it's an integer type then we // write the integer value directly. if (event_tracer_entry.delegate_event_id_type == DelegateDebugIdType::kInt) { etdump_ProfileEvent_delegate_debug_id_int_add( - builder, event_tracer_entry.event_id); + builder_, event_tracer_entry.event_id); } else { etdump_ProfileEvent_delegate_debug_id_str_add( - builder, event_tracer_entry.event_id); + builder_, event_tracer_entry.event_id); } flatbuffers_uint8_vec_ref_t vec_ref = flatbuffers_uint8_vec_create_pe( - builder, (const uint8_t*)metadata, metadata_len); - etdump_ProfileEvent_delegate_debug_metadata_add(builder, vec_ref); - etdump_ProfileEvent_ref_t id = etdump_ProfileEvent_end(builder); - etdump_RunData_events_push_start(builder); - etdump_Event_profile_event_add(builder, id); - etdump_RunData_events_push_end(builder); + builder_, (const uint8_t*)metadata, metadata_len); + etdump_ProfileEvent_delegate_debug_metadata_add(builder_, vec_ref); + etdump_ProfileEvent_ref_t id = etdump_ProfileEvent_end(builder_); + etdump_RunData_events_push_start(builder_); + etdump_Event_profile_event_add(builder_, id); + etdump_RunData_events_push_end(builder_); } void ETDumpGen::log_profiling_delegate( @@ -268,24 +281,24 @@ void ETDumpGen::log_profiling_delegate( "Only name or delegate_debug_index can be valid. Check DelegateMappingBuilder documentation for more details."); check_ready_to_add_events(); int64_t string_id = name != nullptr ? 
create_string_entry(name) : -1; - etdump_ProfileEvent_start(builder); - etdump_ProfileEvent_start_time_add(builder, start_time); - etdump_ProfileEvent_end_time_add(builder, end_time); - etdump_ProfileEvent_chain_index_add(builder, chain_id_); - etdump_ProfileEvent_instruction_id_add(builder, debug_handle_); + etdump_ProfileEvent_start(builder_); + etdump_ProfileEvent_start_time_add(builder_, start_time); + etdump_ProfileEvent_end_time_add(builder_, end_time); + etdump_ProfileEvent_chain_index_add(builder_, chain_id_); + etdump_ProfileEvent_instruction_id_add(builder_, debug_handle_); if (string_id == -1) { etdump_ProfileEvent_delegate_debug_id_int_add( - builder, delegate_debug_index); + builder_, delegate_debug_index); } else { - etdump_ProfileEvent_delegate_debug_id_str_add(builder, string_id); + etdump_ProfileEvent_delegate_debug_id_str_add(builder_, string_id); } flatbuffers_uint8_vec_ref_t vec_ref = flatbuffers_uint8_vec_create_pe( - builder, (const uint8_t*)metadata, metadata_len); - etdump_ProfileEvent_delegate_debug_metadata_add(builder, vec_ref); - etdump_ProfileEvent_ref_t id = etdump_ProfileEvent_end(builder); - etdump_RunData_events_push_start(builder); - etdump_Event_profile_event_add(builder, id); - etdump_RunData_events_push_end(builder); + builder_, (const uint8_t*)metadata, metadata_len); + etdump_ProfileEvent_delegate_debug_metadata_add(builder_, vec_ref); + etdump_ProfileEvent_ref_t id = etdump_ProfileEvent_end(builder_); + etdump_RunData_events_push_start(builder_); + etdump_Event_profile_event_add(builder_, id); + etdump_RunData_events_push_end(builder_); } void ETDumpGen::log_intermediate_output_delegate( @@ -331,7 +344,7 @@ void ETDumpGen::log_intermediate_output_delegate_helper( ET_CHECK_MSG( (name == nullptr) ^ (delegate_debug_index == -1), "Only name or delegate_debug_index can be valid. Check DelegateMappingBuilder documentation for more details."); - if (debug_buffer.empty()) { + if (debug_buffer_.empty()) { ET_CHECK_MSG(0, "Must pre-set debug buffer with set_debug_buffer()\n"); return; } @@ -339,71 +352,71 @@ void ETDumpGen::log_intermediate_output_delegate_helper( check_ready_to_add_events(); int64_t string_id = name != nullptr ? 
create_string_entry(name) : -1; - etdump_DebugEvent_start(builder); + etdump_DebugEvent_start(builder_); - etdump_DebugEvent_chain_index_add(builder, chain_id_); - etdump_DebugEvent_instruction_id_add(builder, debug_handle_); + etdump_DebugEvent_chain_index_add(builder_, chain_id_); + etdump_DebugEvent_instruction_id_add(builder_, debug_handle_); if (string_id == -1) { - etdump_DebugEvent_delegate_debug_id_int_add(builder, delegate_debug_index); + etdump_DebugEvent_delegate_debug_id_int_add(builder_, delegate_debug_index); } else { - etdump_DebugEvent_delegate_debug_id_str_add(builder, string_id); + etdump_DebugEvent_delegate_debug_id_str_add(builder_, string_id); } // Check the type of `output` then call the corresponding logging functions if constexpr (std::is_same::value) { long offset = copy_tensor_to_debug_buffer(output); - etdump_Tensor_ref_t tensor_ref = add_tensor_entry(builder, output, offset); + etdump_Tensor_ref_t tensor_ref = add_tensor_entry(builder_, output, offset); - etdump_Value_start(builder); - etdump_Value_val_add(builder, etdump_ValueType_Tensor); - etdump_Value_tensor_add(builder, tensor_ref); + etdump_Value_start(builder_); + etdump_Value_val_add(builder_, etdump_ValueType_Tensor); + etdump_Value_tensor_add(builder_, tensor_ref); } else if constexpr (std::is_same>::value) { - etdump_Tensor_vec_start(builder); + etdump_Tensor_vec_start(builder_); for (size_t i = 0; i < output.size(); ++i) { long offset = copy_tensor_to_debug_buffer(output[i]); etdump_Tensor_vec_push( - builder, add_tensor_entry(builder, output[i], offset)); + builder_, add_tensor_entry(builder_, output[i], offset)); } - etdump_Tensor_vec_ref_t tensor_vec_ref = etdump_Tensor_vec_end(builder); + etdump_Tensor_vec_ref_t tensor_vec_ref = etdump_Tensor_vec_end(builder_); etdump_TensorList_ref_t tensor_list_ref = - etdump_TensorList_create(builder, tensor_vec_ref); + etdump_TensorList_create(builder_, tensor_vec_ref); - etdump_Value_start(builder); - etdump_Value_val_add(builder, etdump_ValueType_TensorList); - etdump_Value_tensor_list_add(builder, tensor_list_ref); + etdump_Value_start(builder_); + etdump_Value_val_add(builder_, etdump_ValueType_TensorList); + etdump_Value_tensor_list_add(builder_, tensor_list_ref); } else if constexpr (std::is_same::value) { - auto int_ref = etdump_Int_create(builder, output); + auto int_ref = etdump_Int_create(builder_, output); - etdump_Value_start(builder); - etdump_Value_val_add(builder, etdump_ValueType_Int); - etdump_Value_int_value_add(builder, int_ref); + etdump_Value_start(builder_); + etdump_Value_val_add(builder_, etdump_ValueType_Int); + etdump_Value_int_value_add(builder_, int_ref); } else if constexpr (std::is_same::value) { - auto double_ref = etdump_Double_create(builder, output); + auto double_ref = etdump_Double_create(builder_, output); - etdump_Value_start(builder); - etdump_Value_double_value_add(builder, double_ref); - etdump_Value_val_add(builder, etdump_ValueType_Double); + etdump_Value_start(builder_); + etdump_Value_double_value_add(builder_, double_ref); + etdump_Value_val_add(builder_, etdump_ValueType_Double); } else if constexpr (std::is_same::value) { flatbuffers_bool_t flatbuffer_bool_val = output ? 
FLATBUFFERS_TRUE : FLATBUFFERS_FALSE; - auto bool_ref = etdump_Bool_create(builder, flatbuffer_bool_val); + auto bool_ref = etdump_Bool_create(builder_, flatbuffer_bool_val); - etdump_Value_start(builder); - etdump_Value_bool_value_add(builder, bool_ref); - etdump_Value_val_add(builder, etdump_ValueType_Bool); + etdump_Value_start(builder_); + etdump_Value_bool_value_add(builder_, bool_ref); + etdump_Value_val_add(builder_, etdump_ValueType_Bool); } else { ET_CHECK_MSG(0, "Unsupported output type for intermediate logging\n"); } - auto value_ref = etdump_Value_end(builder); - etdump_DebugEvent_debug_entry_add(builder, value_ref); + auto value_ref = etdump_Value_end(builder_); + etdump_DebugEvent_debug_entry_add(builder_, value_ref); - etdump_DebugEvent_ref_t debug_event = etdump_DebugEvent_end(builder); + etdump_DebugEvent_ref_t debug_event = etdump_DebugEvent_end(builder_); - etdump_RunData_events_push_start(builder); - etdump_Event_debug_event_add(builder, debug_event); - etdump_RunData_events_push_end(builder); + etdump_RunData_events_push_start(builder_); + etdump_Event_debug_event_add(builder_, debug_event); + etdump_RunData_events_push_end(builder_); } void ETDumpGen::end_profiling(EventTracerEntry prof_entry) { @@ -413,32 +426,31 @@ void ETDumpGen::end_profiling(EventTracerEntry prof_entry) { "Delegate events must use end_profiling_delegate to mark the end of a delegate profiling event."); check_ready_to_add_events(); - etdump_ProfileEvent_start(builder); - etdump_ProfileEvent_start_time_add(builder, prof_entry.start_time); - etdump_ProfileEvent_end_time_add(builder, end_time); - etdump_ProfileEvent_chain_index_add(builder, prof_entry.chain_id); - etdump_ProfileEvent_instruction_id_add(builder, prof_entry.debug_handle); + etdump_ProfileEvent_start(builder_); + etdump_ProfileEvent_start_time_add(builder_, prof_entry.start_time); + etdump_ProfileEvent_end_time_add(builder_, end_time); + etdump_ProfileEvent_chain_index_add(builder_, prof_entry.chain_id); + etdump_ProfileEvent_instruction_id_add(builder_, prof_entry.debug_handle); if (prof_entry.event_id != -1) { - etdump_ProfileEvent_name_add(builder, prof_entry.event_id); + etdump_ProfileEvent_name_add(builder_, prof_entry.event_id); } - etdump_ProfileEvent_ref_t id = etdump_ProfileEvent_end(builder); - etdump_RunData_events_push_start(builder); - etdump_Event_profile_event_add(builder, id); - etdump_RunData_events_push_end(builder); + etdump_ProfileEvent_ref_t id = etdump_ProfileEvent_end(builder_); + etdump_RunData_events_push_start(builder_); + etdump_Event_profile_event_add(builder_, id); + etdump_RunData_events_push_end(builder_); } AllocatorID ETDumpGen::track_allocator(const char* name) { ET_CHECK_MSG( - (etdump_gen_state == ETDumpGen_Block_Created || - etdump_gen_state == ETDumpGen_Adding_Allocators), + (state_ == State::BlockCreated || state_ == State::AddingAllocators), "Allocators can only be added immediately after a new block is created and before any events are added."); - if (etdump_gen_state != ETDumpGen_Adding_Allocators) { - etdump_RunData_allocators_start(builder); - etdump_gen_state = ETDumpGen_Adding_Allocators; + if (state_ != State::AddingAllocators) { + etdump_RunData_allocators_start(builder_); + state_ = State::AddingAllocators; } flatbuffers_string_ref_t ref = create_string_entry(name); - etdump_RunData_allocators_push_create(builder, ref); - return etdump_RunData_allocators_reserved_len(builder); + etdump_RunData_allocators_push_create(builder_, ref); + return 
etdump_RunData_allocators_reserved_len(builder_); } void ETDumpGen::track_allocation( @@ -446,43 +458,43 @@ void ETDumpGen::track_allocation( size_t allocation_size) { check_ready_to_add_events(); - etdump_RunData_events_push_start(builder); - etdump_Event_allocation_event_create(builder, allocator_id, allocation_size); - etdump_RunData_events_push_end(builder); + etdump_RunData_events_push_start(builder_); + etdump_Event_allocation_event_create(builder_, allocator_id, allocation_size); + etdump_RunData_events_push_end(builder_); } -etdump_result ETDumpGen::get_etdump_data() { - etdump_result result; - if (etdump_gen_state == ETDumpGen_Adding_Events) { - etdump_RunData_events_end(builder); - } else if (etdump_gen_state == ETDumpGen_Adding_Allocators) { - etdump_RunData_allocators_end(builder); - } else if (etdump_gen_state == ETDumpGen_Init) { +ETDumpResult ETDumpGen::get_etdump_data() { + ETDumpResult result; + if (state_ == State::AddingEvents) { + etdump_RunData_events_end(builder_); + } else if (state_ == State::AddingAllocators) { + etdump_RunData_allocators_end(builder_); + } else if (state_ == State::Init) { result.buf = nullptr; result.size = 0; return result; } - etdump_ETDump_run_data_push_end(builder); - etdump_ETDump_run_data_end(builder); - etdump_ETDump_ref_t root = etdump_ETDump_end(builder); - flatbuffers_buffer_end(builder, root); - if (num_blocks == 0) { + etdump_ETDump_run_data_push_end(builder_); + etdump_ETDump_run_data_end(builder_); + etdump_ETDump_ref_t root = etdump_ETDump_end(builder_); + flatbuffers_buffer_end(builder_, root); + if (num_blocks_ == 0) { result = {nullptr, 0}; } else { - if (alloc.data) { - result.buf = alloc.front_cursor; - result.size = alloc.out_size - alloc.front_left; + if (alloc_.data) { + result.buf = alloc_.front_cursor; + result.size = alloc_.out_size - alloc_.front_left; } else { result.buf = - flatcc_builder_finalize_aligned_buffer(builder, &result.size); + flatcc_builder_finalize_aligned_buffer(builder_, &result.size); } } - etdump_gen_state = ETDumpGen_Done; + state_ = State::Done; return result; } void ETDumpGen::set_debug_buffer(Span buffer) { - debug_buffer = buffer; + debug_buffer_ = buffer; } size_t ETDumpGen::copy_tensor_to_debug_buffer(exec_aten::Tensor tensor) { @@ -490,94 +502,94 @@ size_t ETDumpGen::copy_tensor_to_debug_buffer(exec_aten::Tensor tensor) { return static_cast(-1); } uint8_t* offset_ptr = - alignPointer(debug_buffer.data() + debug_buffer_offset, 64); - debug_buffer_offset = (offset_ptr - debug_buffer.data()) + tensor.nbytes(); + alignPointer(debug_buffer_.data() + debug_buffer_offset_, 64); + debug_buffer_offset_ = (offset_ptr - debug_buffer_.data()) + tensor.nbytes(); ET_CHECK_MSG( - debug_buffer_offset <= debug_buffer.size(), + debug_buffer_offset_ <= debug_buffer_.size(), "Ran out of space to store intermediate outputs."); memcpy(offset_ptr, tensor.const_data_ptr(), tensor.nbytes()); - return (size_t)(offset_ptr - debug_buffer.data()); + return (size_t)(offset_ptr - debug_buffer_.data()); } void ETDumpGen::log_evalue(const EValue& evalue, LoggedEValueType evalue_type) { - if (debug_buffer.empty()) { + if (debug_buffer_.empty()) { return; } check_ready_to_add_events(); - etdump_DebugEvent_start(builder); + etdump_DebugEvent_start(builder_); - etdump_DebugEvent_chain_index_add(builder, chain_id_); - etdump_DebugEvent_instruction_id_add(builder, debug_handle_); + etdump_DebugEvent_chain_index_add(builder_, chain_id_); + etdump_DebugEvent_instruction_id_add(builder_, debug_handle_); switch (evalue.tag) { case 
Tag::Tensor: { exec_aten::Tensor tensor = evalue.toTensor(); long offset = copy_tensor_to_debug_buffer(tensor); etdump_Tensor_ref_t tensor_ref = - add_tensor_entry(builder, tensor, offset); + add_tensor_entry(builder_, tensor, offset); - etdump_Value_start(builder); - etdump_Value_val_add(builder, etdump_ValueType_Tensor); - etdump_Value_tensor_add(builder, tensor_ref); + etdump_Value_start(builder_); + etdump_Value_val_add(builder_, etdump_ValueType_Tensor); + etdump_Value_tensor_add(builder_, tensor_ref); if (evalue_type == LoggedEValueType::kProgramOutput) { - auto bool_ref = etdump_Bool_create(builder, FLATBUFFERS_TRUE); - etdump_Value_output_add(builder, bool_ref); + auto bool_ref = etdump_Bool_create(builder_, FLATBUFFERS_TRUE); + etdump_Value_output_add(builder_, bool_ref); } - auto value_ref = etdump_Value_end(builder); + auto value_ref = etdump_Value_end(builder_); - etdump_DebugEvent_debug_entry_add(builder, value_ref); + etdump_DebugEvent_debug_entry_add(builder_, value_ref); break; } case Tag::ListTensor: { exec_aten::ArrayRef tensors = evalue.toTensorList(); - etdump_Tensor_vec_start(builder); + etdump_Tensor_vec_start(builder_); for (size_t i = 0; i < tensors.size(); ++i) { long offset = copy_tensor_to_debug_buffer(tensors[i]); etdump_Tensor_vec_push( - builder, add_tensor_entry(builder, tensors[i], offset)); + builder_, add_tensor_entry(builder_, tensors[i], offset)); } - etdump_Tensor_vec_ref_t tensor_vec_ref = etdump_Tensor_vec_end(builder); + etdump_Tensor_vec_ref_t tensor_vec_ref = etdump_Tensor_vec_end(builder_); etdump_TensorList_ref_t tensor_list_ref = - etdump_TensorList_create(builder, tensor_vec_ref); + etdump_TensorList_create(builder_, tensor_vec_ref); - etdump_Value_start(builder); - etdump_Value_val_add(builder, etdump_ValueType_TensorList); - etdump_Value_tensor_list_add(builder, tensor_list_ref); + etdump_Value_start(builder_); + etdump_Value_val_add(builder_, etdump_ValueType_TensorList); + etdump_Value_tensor_list_add(builder_, tensor_list_ref); if (evalue_type == LoggedEValueType::kProgramOutput) { - auto bool_ref = etdump_Bool_create(builder, FLATBUFFERS_TRUE); - etdump_Value_output_add(builder, bool_ref); + auto bool_ref = etdump_Bool_create(builder_, FLATBUFFERS_TRUE); + etdump_Value_output_add(builder_, bool_ref); } - auto value_ref = etdump_Value_end(builder); + auto value_ref = etdump_Value_end(builder_); - etdump_DebugEvent_debug_entry_add(builder, value_ref); + etdump_DebugEvent_debug_entry_add(builder_, value_ref); break; } case Tag::Int: { int64_t val = evalue.toInt(); - auto int_ref = etdump_Int_create(builder, val); + auto int_ref = etdump_Int_create(builder_, val); - etdump_Value_start(builder); - etdump_Value_val_add(builder, etdump_ValueType_Int); - etdump_Value_int_value_add(builder, int_ref); - auto value_ref = etdump_Value_end(builder); - etdump_DebugEvent_debug_entry_add(builder, value_ref); + etdump_Value_start(builder_); + etdump_Value_val_add(builder_, etdump_ValueType_Int); + etdump_Value_int_value_add(builder_, int_ref); + auto value_ref = etdump_Value_end(builder_); + etdump_DebugEvent_debug_entry_add(builder_, value_ref); break; } case Tag::Double: { double val = evalue.toDouble(); - auto double_ref = etdump_Double_create(builder, val); + auto double_ref = etdump_Double_create(builder_, val); - etdump_Value_start(builder); - etdump_Value_double_value_add(builder, double_ref); - etdump_Value_val_add(builder, etdump_ValueType_Double); - auto value_ref = etdump_Value_end(builder); - etdump_DebugEvent_debug_entry_add(builder, 
value_ref); + etdump_Value_start(builder_); + etdump_Value_double_value_add(builder_, double_ref); + etdump_Value_val_add(builder_, etdump_ValueType_Double); + auto value_ref = etdump_Value_end(builder_); + etdump_DebugEvent_debug_entry_add(builder_, value_ref); break; } @@ -585,13 +597,13 @@ void ETDumpGen::log_evalue(const EValue& evalue, LoggedEValueType evalue_type) { case Tag::Bool: { flatbuffers_bool_t flatbuffer_bool_val = evalue.toBool() ? FLATBUFFERS_TRUE : FLATBUFFERS_FALSE; - auto bool_ref = etdump_Bool_create(builder, flatbuffer_bool_val); + auto bool_ref = etdump_Bool_create(builder_, flatbuffer_bool_val); - etdump_Value_start(builder); - etdump_Value_bool_value_add(builder, bool_ref); - etdump_Value_val_add(builder, etdump_ValueType_Bool); - auto value_ref = etdump_Value_end(builder); - etdump_DebugEvent_debug_entry_add(builder, value_ref); + etdump_Value_start(builder_); + etdump_Value_bool_value_add(builder_, bool_ref); + etdump_Value_val_add(builder_, etdump_ValueType_Bool); + auto value_ref = etdump_Value_end(builder_); + etdump_DebugEvent_debug_entry_add(builder_, value_ref); break; } @@ -604,20 +616,20 @@ void ETDumpGen::log_evalue(const EValue& evalue, LoggedEValueType evalue_type) { break; } - etdump_DebugEvent_ref_t debug_event = etdump_DebugEvent_end(builder); + etdump_DebugEvent_ref_t debug_event = etdump_DebugEvent_end(builder_); - etdump_RunData_events_push_start(builder); - etdump_Event_debug_event_add(builder, debug_event); - etdump_RunData_events_push_end(builder); + etdump_RunData_events_push_start(builder_); + etdump_Event_debug_event_add(builder_, debug_event); + etdump_RunData_events_push_end(builder_); } size_t ETDumpGen::get_num_blocks() { - return num_blocks; + return num_blocks_; } bool ETDumpGen::is_static_etdump() { - return alloc.data != nullptr; + return alloc_.data != nullptr; } -} // namespace executor -} // namespace torch +} // namespace etdump +} // namespace executorch diff --git a/sdk/etdump/etdump_flatcc.h b/devtools/etdump/etdump_flatcc.h similarity index 53% rename from sdk/etdump/etdump_flatcc.h rename to devtools/etdump/etdump_flatcc.h index e56d09f8107..0bd891a0970 100644 --- a/sdk/etdump/etdump_flatcc.h +++ b/devtools/etdump/etdump_flatcc.h @@ -8,33 +8,22 @@ #pragma once -#include #include -#include "executorch/runtime/core/event_tracer.h" -#include "executorch/runtime/platform/platform.h" + +#include +#include +#include #define ETDUMP_VERSION 0 struct flatcc_builder; -namespace torch { -namespace executor { - -enum ETDumpGen_State { - ETDumpGen_Init, - ETDumpGen_Block_Created, - ETDumpGen_Adding_Allocators, - ETDumpGen_Adding_Events, - ETDumpGen_Done, -}; +namespace executorch { +namespace etdump { -struct etdump_result { - void* buf; - size_t size; -}; - -struct etdump_static_allocator { - etdump_static_allocator() {} +namespace internal { +struct ETDumpStaticAllocator { + ETDumpStaticAllocator() = default; void set_buffer(uint8_t* buffer, size_t total_buf_size, size_t alloc_buf_size) { @@ -64,61 +53,72 @@ struct etdump_static_allocator { // Bytes left in front of front_cursor. 
size_t front_left{0}; }; +} // namespace internal + +struct ETDumpResult { + void* buf; + size_t size; +}; -class ETDumpGen : public EventTracer { +class ETDumpGen : public ::executorch::runtime::EventTracer { public: - ETDumpGen(Span buffer = {nullptr, (size_t)0}); + ETDumpGen(::executorch::runtime::Span buffer = {nullptr, (size_t)0}); ~ETDumpGen() override; void clear_builder(); void create_event_block(const char* name) override; - virtual EventTracerEntry start_profiling( + virtual ::executorch::runtime::EventTracerEntry start_profiling( const char* name, - ChainID chain_id = -1, - DebugHandle debug_handle = 0) override; - virtual void end_profiling(EventTracerEntry prof_entry) override; - virtual EventTracerEntry start_profiling_delegate( + ::executorch::runtime::ChainID chain_id = -1, + ::executorch::runtime::DebugHandle debug_handle = 0) override; + virtual void end_profiling( + ::executorch::runtime::EventTracerEntry prof_entry) override; + virtual ::executorch::runtime::EventTracerEntry start_profiling_delegate( const char* name, - DebugHandle delegate_debug_index) override; + ::executorch::runtime::DebugHandle delegate_debug_index) override; virtual void end_profiling_delegate( - EventTracerEntry prof_entry, + ::executorch::runtime::EventTracerEntry prof_entry, const void* metadata, size_t metadata_len) override; virtual void log_profiling_delegate( const char* name, - DebugHandle delegate_debug_index, + ::executorch::runtime::DebugHandle delegate_debug_index, et_timestamp_t start_time, et_timestamp_t end_time, const void* metadata, size_t metadata_len) override; - virtual void track_allocation(AllocatorID id, size_t size) override; - virtual AllocatorID track_allocator(const char* name) override; + virtual void track_allocation( + ::executorch::runtime::AllocatorID id, + size_t size) override; + virtual ::executorch::runtime::AllocatorID track_allocator( + const char* name) override; virtual void log_evalue( - const EValue& evalue, - LoggedEValueType evalue_type = - LoggedEValueType::kIntermediateOutput) override; + const ::executorch::runtime::EValue& evalue, + ::executorch::runtime::LoggedEValueType evalue_type = + ::executorch::runtime::LoggedEValueType::kIntermediateOutput) + override; /** * Log an intermediate tensor output from a delegate. */ virtual void log_intermediate_output_delegate( const char* name, - DebugHandle delegate_debug_index, - const Tensor& output) override; + ::executorch::runtime::DebugHandle delegate_debug_index, + const exec_aten::Tensor& output) override; /** * Log an intermediate tensor array output from a delegate. */ virtual void log_intermediate_output_delegate( const char* name, - DebugHandle delegate_debug_index, - const ArrayRef output) override; + ::executorch::runtime::DebugHandle delegate_debug_index, + const ::executorch::runtime::ArrayRef output) override; /** * Log an intermediate int output from a delegate. 
*/ virtual void log_intermediate_output_delegate( const char* name, - DebugHandle delegate_debug_index, + ::executorch::runtime::DebugHandle delegate_debug_index, const int& output) override; /** @@ -126,7 +126,7 @@ class ETDumpGen : public EventTracer { */ virtual void log_intermediate_output_delegate( const char* name, - DebugHandle delegate_debug_index, + ::executorch::runtime::DebugHandle delegate_debug_index, const bool& output) override; /** @@ -134,22 +134,22 @@ class ETDumpGen : public EventTracer { */ virtual void log_intermediate_output_delegate( const char* name, - DebugHandle delegate_debug_index, + ::executorch::runtime::DebugHandle delegate_debug_index, const double& output) override; - void set_debug_buffer(Span buffer); - etdump_result get_etdump_data(); + void set_debug_buffer(::executorch::runtime::Span buffer); + ETDumpResult get_etdump_data(); size_t get_num_blocks(); bool is_static_etdump(); void reset(); private: - struct flatcc_builder* builder; - size_t num_blocks = 0; - Span debug_buffer; - size_t debug_buffer_offset = 0; - int bundled_input_index = -1; - ETDumpGen_State etdump_gen_state = ETDumpGen_Init; - struct etdump_static_allocator alloc; + enum class State { + Init, + BlockCreated, + AddingAllocators, + AddingEvents, + Done, + }; void check_ready_to_add_events(); int64_t create_string_entry(const char* name); @@ -162,9 +162,26 @@ class ETDumpGen : public EventTracer { template void log_intermediate_output_delegate_helper( const char* name, - DebugHandle delegate_debug_index, + ::executorch::runtime::DebugHandle delegate_debug_index, const T& output); + + struct flatcc_builder* builder_; + size_t num_blocks_ = 0; + ::executorch::runtime::Span debug_buffer_; + size_t debug_buffer_offset_ = 0; + int bundled_input_index_ = -1; + State state_ = State::Init; + struct internal::ETDumpStaticAllocator alloc_; }; +} // namespace etdump +} // namespace executorch + +namespace torch { +namespace executor { +// TODO(T197294990): Remove these deprecated aliases once all users have moved +// to the new `::executorch` namespaces. +using etdump_result = ::executorch::etdump::ETDumpResult; +using ::executorch::etdump::ETDumpGen; } // namespace executor } // namespace torch diff --git a/sdk/etdump/etdump_schema_flatcc.fbs b/devtools/etdump/etdump_schema_flatcc.fbs similarity index 96% rename from sdk/etdump/etdump_schema_flatcc.fbs rename to devtools/etdump/etdump_schema_flatcc.fbs index d90d278f5fc..1244ebd4aeb 100644 --- a/sdk/etdump/etdump_schema_flatcc.fbs +++ b/devtools/etdump/etdump_schema_flatcc.fbs @@ -76,6 +76,10 @@ table DebugEvent { // String based delegate debug identifier. delegate_debug_id_str:string; + + // Name assigned to this debug event by the runtime. If it is an operator + // call this will just be the name of the operator that was executed. + name:string; } // All the details pertaining to an allocation done in the runtime. The main diff --git a/sdk/etdump/scalar_type.fbs b/devtools/etdump/scalar_type.fbs similarity index 97% rename from sdk/etdump/scalar_type.fbs rename to devtools/etdump/scalar_type.fbs index fdfe550e9e3..a8da080c679 100644 --- a/sdk/etdump/scalar_type.fbs +++ b/devtools/etdump/scalar_type.fbs @@ -14,6 +14,7 @@ enum ScalarType : byte { SHORT = 2, INT = 3, LONG = 4, + HALF = 5, FLOAT = 6, DOUBLE = 7, BOOL = 11, @@ -24,7 +25,6 @@ enum ScalarType : byte { QUINT4X2 = 16, QUINT2X4 = 17, // Types currently not implemented. 
- // Half = 5, // COMPLEXHALF = 8, // COMPLEXFLOAT = 9, // COMPLEXDOUBLE = 10, diff --git a/sdk/etdump/schema_flatcc.py b/devtools/etdump/schema_flatcc.py similarity index 96% rename from sdk/etdump/schema_flatcc.py rename to devtools/etdump/schema_flatcc.py index eaad876a536..404fa1c9758 100644 --- a/sdk/etdump/schema_flatcc.py +++ b/devtools/etdump/schema_flatcc.py @@ -7,7 +7,7 @@ # pyre-strict """ This file is the python representation of the schema contained in -executorch/sdk/etdump/etdump_schema.fbs. Any changes made to that +executorch/devtools/etdump/etdump_schema.fbs. Any changes made to that flatbuffer schema should accordingly be reflected here also. """ @@ -93,6 +93,7 @@ class Value: @dataclass class DebugEvent: + name: Optional[str] chain_index: int instruction_id: int delegate_debug_id_int: Optional[int] diff --git a/sdk/etdump/serialize.py b/devtools/etdump/serialize.py similarity index 98% rename from sdk/etdump/serialize.py rename to devtools/etdump/serialize.py index 0cc6682bfcb..4ed63bc385b 100644 --- a/sdk/etdump/serialize.py +++ b/devtools/etdump/serialize.py @@ -11,11 +11,11 @@ import tempfile import pkg_resources +from executorch.devtools.etdump.schema_flatcc import ETDumpFlatCC from executorch.exir._serialize._dataclass import _DataclassEncoder, _json_to_dataclass from executorch.exir._serialize._flatbuffer import _flatc_compile, _flatc_decompile -from executorch.sdk.etdump.schema_flatcc import ETDumpFlatCC # The prefix of schema files used for etdump ETDUMP_FLATCC_SCHEMA_NAME = "etdump_schema_flatcc" diff --git a/sdk/etdump/targets.bzl b/devtools/etdump/targets.bzl similarity index 99% rename from sdk/etdump/targets.bzl rename to devtools/etdump/targets.bzl index 6d548ce650f..ddbb35eab74 100644 --- a/sdk/etdump/targets.bzl +++ b/devtools/etdump/targets.bzl @@ -95,9 +95,11 @@ def define_common_targets(): "etdump_flatcc.cpp", "emitter.cpp", ], + headers = [ + "emitter.h", + ], exported_headers = [ "etdump_flatcc.h", - "emitter.h", ], deps = [ "//executorch/runtime/platform:platform", diff --git a/sdk/etdump/tests/CMakeLists.txt b/devtools/etdump/tests/CMakeLists.txt similarity index 100% rename from sdk/etdump/tests/CMakeLists.txt rename to devtools/etdump/tests/CMakeLists.txt diff --git a/sdk/etdump/tests/TARGETS b/devtools/etdump/tests/TARGETS similarity index 75% rename from sdk/etdump/tests/TARGETS rename to devtools/etdump/tests/TARGETS index ad48948c48a..51e807891df 100644 --- a/sdk/etdump/tests/TARGETS +++ b/devtools/etdump/tests/TARGETS @@ -11,8 +11,8 @@ python_unittest( "serialize_test.py", ], deps = [ + "//executorch/devtools/etdump:schema_flatcc", + "//executorch/devtools/etdump:serialize", "//executorch/exir/_serialize:lib", - "//executorch/sdk/etdump:schema_flatcc", - "//executorch/sdk/etdump:serialize", ], ) diff --git a/sdk/etdump/tests/etdump_test.cpp b/devtools/etdump/tests/etdump_test.cpp similarity index 94% rename from sdk/etdump/tests/etdump_test.cpp rename to devtools/etdump/tests/etdump_test.cpp index d30bd9a3037..b750e21eb07 100644 --- a/sdk/etdump/tests/etdump_test.cpp +++ b/devtools/etdump/tests/etdump_test.cpp @@ -9,19 +9,31 @@ #include #include +#include +#include +#include #include #include #include -#include -#include -#include #include #include #include #include -namespace torch { -namespace executor { +using ::exec_aten::ScalarType; +using ::exec_aten::Tensor; +using ::executorch::etdump::ETDumpGen; +using ::executorch::etdump::ETDumpResult; +using ::executorch::runtime::AllocatorID; +using ::executorch::runtime::ArrayRef; +using 
::executorch::runtime::BoxedEvalueList; +using ::executorch::runtime::DelegateDebugIdType; +using ::executorch::runtime::EValue; +using ::executorch::runtime::EventTracerEntry; +using ::executorch::runtime::LoggedEValueType; +using ::executorch::runtime::Span; +using ::executorch::runtime::Tag; +using ::executorch::runtime::testing::TensorFactory; class ProfilerETDumpTest : public ::testing::Test { protected: @@ -49,7 +61,7 @@ TEST_F(ProfilerETDumpTest, SingleProfileEvent) { EventTracerEntry entry = etdump_gen[i]->start_profiling("test_event", 0, 1); etdump_gen[i]->end_profiling(entry); - etdump_result result = etdump_gen[i]->get_etdump_data(); + ETDumpResult result = etdump_gen[i]->get_etdump_data(); ASSERT_TRUE(result.buf != nullptr); ASSERT_TRUE(result.size != 0); @@ -105,7 +117,7 @@ TEST_F(ProfilerETDumpTest, EmptyBlocks) { etdump_gen[i]->start_profiling("test_event_1", 0, 1); etdump_gen[i]->end_profiling(entry); - etdump_result result = etdump_gen[i]->get_etdump_data(); + ETDumpResult result = etdump_gen[i]->get_etdump_data(); ASSERT_TRUE(result.buf != nullptr); ASSERT_TRUE(result.size != 0); @@ -160,7 +172,7 @@ TEST_F(ProfilerETDumpTest, AllocationEvents) { TEST_F(ProfilerETDumpTest, DebugEvent) { for (size_t i = 0; i < 2; i++) { - testing::TensorFactory tf; + TensorFactory tf; EValue evalue(tf.ones({3, 2})); etdump_gen[i]->create_event_block("test_block"); @@ -189,7 +201,7 @@ TEST_F(ProfilerETDumpTest, DebugEvent) { TEST_F(ProfilerETDumpTest, DebugEventTensorList) { for (size_t i = 0; i < 2; i++) { - testing::TensorFactory tf; + TensorFactory tf; exec_aten::Tensor storage[2] = {tf.ones({3, 2}), tf.ones({3, 2})}; EValue evalue_1(storage[0]); EValue evalue_2(storage[1]); @@ -212,7 +224,7 @@ TEST_F(ProfilerETDumpTest, DebugEventTensorList) { } TEST_F(ProfilerETDumpTest, VerifyLogging) { - testing::TensorFactory tf; + TensorFactory tf; EValue evalue(tf.ones({3, 2})); for (size_t i = 0; i < 2; i++) { @@ -225,7 +237,7 @@ TEST_F(ProfilerETDumpTest, VerifyLogging) { etdump_gen[i]->log_evalue(evalue); etdump_gen[i]->log_evalue(evalue, LoggedEValueType::kProgramOutput); - etdump_result result = etdump_gen[i]->get_etdump_data(); + ETDumpResult result = etdump_gen[i]->get_etdump_data(); ASSERT_TRUE(result.buf != nullptr); ASSERT_TRUE(result.size != 0); @@ -297,7 +309,7 @@ TEST_F(ProfilerETDumpTest, MultipleBlocksWithEvents) { entry = etdump_gen[i]->start_profiling("test_event", 0, 1); etdump_gen[i]->end_profiling(entry); - etdump_result result = etdump_gen[i]->get_etdump_data(); + ETDumpResult result = etdump_gen[i]->get_etdump_data(); ASSERT_TRUE(result.buf != nullptr); ASSERT_TRUE(result.size != 0); @@ -363,7 +375,7 @@ TEST_F(ProfilerETDumpTest, VerifyData) { entry = etdump_gen[i]->start_profiling("test_event2", 0, 1); etdump_gen[i]->end_profiling(entry); - etdump_result result = etdump_gen[i]->get_etdump_data(); + ETDumpResult result = etdump_gen[i]->get_etdump_data(); ASSERT_TRUE(result.buf != nullptr); ASSERT_TRUE(result.size != 0); @@ -421,7 +433,7 @@ TEST_F(ProfilerETDumpTest, LogDelegateIntermediateOutput) { Span buffer((uint8_t*)ptr, 2048); etdump_gen[i]->create_event_block("test_block"); - testing::TensorFactory tf; + TensorFactory tf; ET_EXPECT_DEATH( etdump_gen[i]->log_intermediate_output_delegate( @@ -462,7 +474,7 @@ TEST_F(ProfilerETDumpTest, LogDelegateIntermediateOutput) { static_cast(-1), true); - etdump_result result = etdump_gen[i]->get_etdump_data(); + ETDumpResult result = etdump_gen[i]->get_etdump_data(); ASSERT_TRUE(result.buf != nullptr); ASSERT_TRUE(result.size != 0); 
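As a rough, hypothetical sketch of the flow these tests exercise — constructing the renamed `::executorch::etdump::ETDumpGen`, profiling an event, and writing out the resulting `ETDumpResult` — something like the following could be used. The include path, output file name, and the `free()` of the returned buffer are assumptions based on the sdk/ → devtools/ move in this diff, not part of the change itself.

```cpp
// Hypothetical usage sketch of the renamed ETDumpGen API shown in this diff.
// Include path assumed from the sdk/ -> devtools/ move; file name and free()
// of the returned buffer are illustrative assumptions.
#include <cstdio>
#include <cstdlib>

#include <executorch/devtools/etdump/etdump_flatcc.h>

using ::executorch::etdump::ETDumpGen;
using ::executorch::etdump::ETDumpResult;
using ::executorch::runtime::EventTracerEntry;

int main() {
  ETDumpGen etdump_gen;                      // default ctor: dynamically allocated builder
  etdump_gen.create_event_block("forward");  // events must be recorded inside a block

  EventTracerEntry entry =
      etdump_gen.start_profiling("my_event", /*chain_id=*/0, /*debug_handle=*/1);
  // ... code being profiled would run here ...
  etdump_gen.end_profiling(entry);

  ETDumpResult result = etdump_gen.get_etdump_data();  // {buf, size}
  if (result.buf != nullptr && result.size > 0) {
    FILE* f = std::fopen("etdump.etdp", "wb");  // placeholder file name
    std::fwrite(result.buf, 1, result.size, f);
    std::fclose(f);
    std::free(result.buf);  // assumption: heap-allocated when no Span buffer was supplied
  }
  return 0;
}
```

The same calls appear throughout `devtools/etdump/tests/etdump_test.cpp` above, just with the generator instances held in the test fixture.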
@@ -474,7 +486,7 @@ TEST_F(ProfilerETDumpTest, LogDelegateIntermediateOutput) { } TEST_F(ProfilerETDumpTest, VerifyDelegateIntermediateLogging) { - testing::TensorFactory tf; + TensorFactory tf; EValue evalue(tf.ones({3, 2})); for (size_t i = 0; i < 2; i++) { @@ -492,7 +504,7 @@ TEST_F(ProfilerETDumpTest, VerifyDelegateIntermediateLogging) { etdump_gen[i]->log_intermediate_output_delegate( nullptr, 258, tf.ones({5, 6})); - etdump_result result = etdump_gen[i]->get_etdump_data(); + ETDumpResult result = etdump_gen[i]->get_etdump_data(); ASSERT_TRUE(result.buf != nullptr); ASSERT_TRUE(result.size != 0); @@ -603,7 +615,7 @@ TEST_F(ProfilerETDumpTest, LogDelegateEvents) { etdump_gen[i]->end_profiling(entry), "Delegate events must use end_profiling_delegate to mark the end of a delegate profiling event."); - etdump_result result = etdump_gen[i]->get_etdump_data(); + ETDumpResult result = etdump_gen[i]->get_etdump_data(); ASSERT_TRUE(result.buf != nullptr); ASSERT_TRUE(result.size != 0); @@ -681,7 +693,7 @@ TEST_F(ProfilerETDumpTest, WriteAfterGetETDumpData) { etdump_gen[i]->start_profiling("test_event", 0, 1); etdump_gen[i]->end_profiling(entry); - etdump_result result = etdump_gen[i]->get_etdump_data(); + ETDumpResult result = etdump_gen[i]->get_etdump_data(); ASSERT_TRUE(result.buf != nullptr); ASSERT_TRUE(result.size != 0); @@ -712,6 +724,3 @@ TEST_F(ProfilerETDumpTest, WriteAfterGetETDumpData) { } } } - -} // namespace executor -} // namespace torch diff --git a/sdk/etdump/tests/serialize_test.py b/devtools/etdump/tests/serialize_test.py similarity index 96% rename from sdk/etdump/tests/serialize_test.py rename to devtools/etdump/tests/serialize_test.py index 2b1497f5974..5cab3e5b2ba 100644 --- a/sdk/etdump/tests/serialize_test.py +++ b/devtools/etdump/tests/serialize_test.py @@ -12,13 +12,13 @@ from pprint import pformat from typing import List -import executorch.sdk.etdump.schema_flatcc as flatcc -from executorch.exir._serialize._dataclass import _DataclassEncoder +import executorch.devtools.etdump.schema_flatcc as flatcc -from executorch.sdk.etdump.serialize import ( +from executorch.devtools.etdump.serialize import ( deserialize_from_etdump_flatcc, serialize_to_etdump_flatcc, ) +from executorch.exir._serialize._dataclass import _DataclassEncoder def diff_jsons(a: str, b: str) -> List[str]: @@ -83,6 +83,7 @@ def get_sample_etdump_flatcc() -> flatcc.ETDumpFlatCC: profile_event=None, allocation_event=None, debug_event=flatcc.DebugEvent( + name="test_debug_event", chain_index=1, instruction_id=0, delegate_debug_id_str="56", diff --git a/sdk/etdump/tests/targets.bzl b/devtools/etdump/tests/targets.bzl similarity index 82% rename from sdk/etdump/tests/targets.bzl rename to devtools/etdump/tests/targets.bzl index 41b19ca65ef..5299b7c1cb7 100644 --- a/sdk/etdump/tests/targets.bzl +++ b/devtools/etdump/tests/targets.bzl @@ -13,8 +13,8 @@ def define_common_targets(): "etdump_test.cpp", ], deps = [ - "//executorch/sdk/etdump:etdump_flatcc", - "//executorch/sdk/etdump:etdump_schema_flatcc", + "//executorch/devtools/etdump:etdump_flatcc", + "//executorch/devtools/etdump:etdump_schema_flatcc", "//executorch/runtime/platform:platform", "//executorch/runtime/core/exec_aten/testing_util:tensor_util", ], diff --git a/sdk/etrecord/TARGETS b/devtools/etrecord/TARGETS similarity index 71% rename from sdk/etrecord/TARGETS rename to devtools/etrecord/TARGETS index c7de63a81f4..09fc3212bf8 100644 --- a/sdk/etrecord/TARGETS +++ b/devtools/etrecord/TARGETS @@ -9,10 +9,10 @@ python_library( "_etrecord.py", ], 
deps = [ + "//executorch/devtools/bundled_program:core", + "//executorch/devtools/bundled_program/schema:bundled_program_schema_py", "//executorch/exir:lib", "//executorch/exir/emit:emit", "//executorch/exir/serde:serialize", - "//executorch/sdk/bundled_program:core", - "//executorch/sdk/bundled_program/schema:bundled_program_schema_py", ], ) diff --git a/sdk/etrecord/__init__.py b/devtools/etrecord/__init__.py similarity index 86% rename from sdk/etrecord/__init__.py rename to devtools/etrecord/__init__.py index 29c29462a7e..59ff4e44c2f 100644 --- a/sdk/etrecord/__init__.py +++ b/devtools/etrecord/__init__.py @@ -4,7 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -from executorch.sdk.etrecord._etrecord import ( +from executorch.devtools.etrecord._etrecord import ( ETRecord, generate_etrecord, parse_etrecord, diff --git a/sdk/etrecord/_etrecord.py b/devtools/etrecord/_etrecord.py similarity index 96% rename from sdk/etrecord/_etrecord.py rename to devtools/etrecord/_etrecord.py index 55e231f2166..de7cf93990a 100644 --- a/sdk/etrecord/_etrecord.py +++ b/devtools/etrecord/_etrecord.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import json import os import pickle @@ -12,6 +14,9 @@ from zipfile import BadZipFile, ZipFile from executorch import exir +from executorch.devtools.bundled_program.core import BundledProgram + +from executorch.devtools.bundled_program.schema.bundled_program_schema import Value from executorch.exir import ( EdgeProgramManager, ExecutorchProgram, @@ -23,9 +28,6 @@ from executorch.exir.serde.export_serialize import SerializedArtifact from executorch.exir.serde.serialize import deserialize, serialize -from executorch.sdk.bundled_program.core import BundledProgram - -from executorch.sdk.bundled_program.schema.bundled_program_schema import Value ProgramOutput = List[Value] @@ -182,13 +184,13 @@ def generate_etrecord( is the closest graph module representation of what is eventually run on the device. In addition to all the graph modules, we also serialize the program buffer, which the users can provide to the ExecuTorch runtime to run the model, and the debug handle map - for SDK tooling usage. + for Developer Tools usage. Args: - etrecord_path: Path to where the `ETRecord` file will be saved to. + et_record: Path to where the `ETRecord` file will be saved to. edge_dialect_program: `EdgeProgramManager` for this model returned by the call to to_edge() executorch_program: The ExecuTorch program for this model returned by the call to `to_executorch()` or the `BundledProgram` of this model - export_modules[Optional]: **Should be ignored by OSS users**. A dictionary of graph modules with the key being the user provided name and the + export_modules [Optional]: **Should be ignored by OSS users**. A dictionary of graph modules with the key being the user provided name and the value being the corresponding exported module. The exported graph modules can be either the output of `torch.export()` or `exir.to_edge()`. @@ -201,7 +203,7 @@ def generate_etrecord( etrecord_zip = ZipFile(et_record, "w") # Write the magic file identifier that will be used to verify that this file - # is an etrecord when it's used later in the SDK tooling. + # is an etrecord when it's used later in the Developer Tools. 
etrecord_zip.writestr(ETRecordReservedFileNames.ETRECORD_IDENTIFIER, "") if export_modules is not None: diff --git a/sdk/etrecord/tests/TARGETS b/devtools/etrecord/tests/TARGETS similarity index 64% rename from sdk/etrecord/tests/TARGETS rename to devtools/etrecord/tests/TARGETS index 0984c755a4e..fffa7f18341 100644 --- a/sdk/etrecord/tests/TARGETS +++ b/devtools/etrecord/tests/TARGETS @@ -8,11 +8,11 @@ python_unittest( srcs = ["etrecord_test.py"], deps = [ "//caffe2:torch", + "//executorch/devtools/bundled_program:config", + "//executorch/devtools/bundled_program:core", + "//executorch/devtools/etrecord:etrecord", "//executorch/exir:lib", "//executorch/exir/tests:models", - "//executorch/sdk/bundled_program:config", - "//executorch/sdk/bundled_program:core", - "//executorch/sdk/etrecord:etrecord", ], ) @@ -21,10 +21,10 @@ python_library( srcs = ["etrecord_test.py"], deps = [ "//caffe2:torch", + "//executorch/devtools/bundled_program:config", + "//executorch/devtools/bundled_program:core", + "//executorch/devtools/etrecord:etrecord", "//executorch/exir:lib", "//executorch/exir/tests:models", - "//executorch/sdk/bundled_program:config", - "//executorch/sdk/bundled_program:core", - "//executorch/sdk/etrecord:etrecord", ], ) diff --git a/sdk/etrecord/tests/etrecord_test.py b/devtools/etrecord/tests/etrecord_test.py similarity index 95% rename from sdk/etrecord/tests/etrecord_test.py rename to devtools/etrecord/tests/etrecord_test.py index bc534fd4871..daef7c3e1e2 100644 --- a/sdk/etrecord/tests/etrecord_test.py +++ b/devtools/etrecord/tests/etrecord_test.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import copy import json import tempfile @@ -12,14 +14,14 @@ import executorch.exir.tests.models as models import torch from executorch import exir -from executorch.exir import EdgeCompileConfig, EdgeProgramManager, to_edge -from executorch.sdk.bundled_program.config import MethodTestCase, MethodTestSuite -from executorch.sdk.bundled_program.core import BundledProgram -from executorch.sdk.etrecord import generate_etrecord, parse_etrecord -from executorch.sdk.etrecord._etrecord import ( +from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite +from executorch.devtools.bundled_program.core import BundledProgram +from executorch.devtools.etrecord import generate_etrecord, parse_etrecord +from executorch.devtools.etrecord._etrecord import ( _get_reference_outputs, ETRecordReservedFileNames, ) +from executorch.exir import EdgeCompileConfig, EdgeProgramManager, to_edge from torch.export import export @@ -75,7 +77,7 @@ def get_test_model_with_manager(self): return (aten_dialect, edge_program_copy, edge_program.to_executorch()) # Serialized and deserialized graph modules are not completely the same, so we check - # that they are close enough and match especially on the parameters we care about in the SDK. + # that they are close enough and match especially on the parameters we care about in the Developer Tools. 
def check_graph_closeness(self, graph_a, graph_b): self.assertEqual(len(graph_a.graph.nodes), len(graph_b.graph.nodes)) for node_a, node_b in zip(graph_a.graph.nodes, graph_b.graph.nodes): diff --git a/sdk/inspector/TARGETS b/devtools/inspector/TARGETS similarity index 67% rename from sdk/inspector/TARGETS rename to devtools/inspector/TARGETS index bc53c90c115..bba5f7f8951 100644 --- a/sdk/inspector/TARGETS +++ b/devtools/inspector/TARGETS @@ -14,10 +14,10 @@ python_library( "fbsource//third-party/pypi/pandas:pandas", "fbsource//third-party/pypi/tabulate:tabulate", ":inspector_utils", + "//executorch/devtools/debug_format:et_schema", + "//executorch/devtools/etdump:schema_flatcc", + "//executorch/devtools/etrecord:etrecord", "//executorch/exir:lib", - "//executorch/sdk/debug_format:et_schema", - "//executorch/sdk/etdump:schema_flatcc", - "//executorch/sdk/etrecord:etrecord", ], ) @@ -26,8 +26,8 @@ python_binary( main_function = ".inspector_cli.main", main_src = "inspector_cli.py", deps = [ - ":inspector_utils", - "//executorch/sdk:lib", + "//executorch/devtools:lib", + "//executorch/devtools/inspector:lib", ], ) @@ -40,11 +40,11 @@ python_library( "fbsource//third-party/pypi/matplotlib:matplotlib", "fbsource//third-party/pypi/numpy:numpy", "//caffe2:torch", - "//executorch/sdk/debug_format:base_schema", - "//executorch/sdk/debug_format:et_schema", - "//executorch/sdk/etdump:schema_flatcc", - "//executorch/sdk/etdump:serialize", - "//executorch/sdk/etrecord:etrecord", + "//executorch/devtools/debug_format:base_schema", + "//executorch/devtools/debug_format:et_schema", + "//executorch/devtools/etdump:schema_flatcc", + "//executorch/devtools/etdump:serialize", + "//executorch/devtools/etrecord:etrecord", ], ) diff --git a/devtools/inspector/__init__.py b/devtools/inspector/__init__.py new file mode 100644 index 00000000000..375123a0a5b --- /dev/null +++ b/devtools/inspector/__init__.py @@ -0,0 +1,24 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-unsafe + +from executorch.devtools.inspector._inspector import ( + Event, + EventBlock, + Inspector, + PerfData, +) +from executorch.devtools.inspector._inspector_utils import compare_results, TimeScale + +__all__ = [ + "Event", + "EventBlock", + "Inspector", + "PerfData", + "compare_results", + "TimeScale", +] diff --git a/sdk/inspector/_inspector.py b/devtools/inspector/_inspector.py similarity index 94% rename from sdk/inspector/_inspector.py rename to devtools/inspector/_inspector.py index 5f9bfafee70..0539d4f5e4b 100644 --- a/sdk/inspector/_inspector.py +++ b/devtools/inspector/_inspector.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+# pyre-unsafe + import dataclasses import logging import sys @@ -26,16 +28,20 @@ Union, ) -import executorch.sdk.etdump.schema_flatcc as flatcc +import executorch.devtools.etdump.schema_flatcc as flatcc import numpy as np import pandas as pd -from executorch.exir import ExportedProgram -from executorch.sdk.debug_format.et_schema import OperatorGraph, OperatorNode -from executorch.sdk.etdump.schema_flatcc import DebugEvent, ETDumpFlatCC, ProfileEvent -from executorch.sdk.etrecord import ETRecord, parse_etrecord -from executorch.sdk.inspector._inspector_utils import ( +from executorch.devtools.debug_format.et_schema import OperatorGraph, OperatorNode +from executorch.devtools.etdump.schema_flatcc import ( + DebugEvent, + ETDumpFlatCC, + ProfileEvent, +) +from executorch.devtools.etrecord import ETRecord, parse_etrecord +from executorch.devtools.inspector._inspector_utils import ( + calculate_time_scale_factor, create_debug_handle_to_op_node_mapping, EDGE_DIALECT_GRAPH_KEY, EXCLUDED_COLUMNS_WHEN_PRINTING, @@ -49,10 +55,10 @@ is_inference_output_equal, ProgramOutput, RESERVED_FRAMEWORK_EVENT_NAMES, - TIME_SCALE_DICT, TimeScale, verify_debug_data_equivalence, ) +from executorch.exir import ExportedProgram from tabulate import tabulate @@ -146,6 +152,7 @@ def _gen_from_event(event: ProfileEvent) -> "ProfileEventSignature": # Signature of a DebugEvent @dataclass(frozen=True, order=True) class DebugEventSignature: + name: str = "" instruction_id: Optional[int] = -1 delegate_id: Optional[int] = None delegate_id_str: Optional[str] = None @@ -159,6 +166,7 @@ def _gen_from_event(event: DebugEvent) -> "DebugEventSignature": The Signature will convert these back to the intended None value """ return DebugEventSignature( + event.name or "", event.instruction_id if event.instruction_id != -1 else None, event.delegate_debug_id_int if event.delegate_debug_id_int != -1 else None, event.delegate_debug_id_str if event.delegate_debug_id_str != "" else None, @@ -464,46 +472,63 @@ def _calculate_elapsed_time(start_time, end_time): return elapsed_time @staticmethod - def _populate_profiling_related_fields( + def _populate_event_signature_fields( ret_event: "Event", - profile_event_signature: Optional[ProfileEventSignature], - events: List[InstructionEvent], - scale_factor: float, + event_signature: Optional[Union[ProfileEventSignature, DebugEventSignature]], ) -> None: """ Given a partially constructed Event, populate the fields related to - the profile events + the profile event signature or debug event signature Fields Updated: name delegate_debug_identifier is_delegated_op - perf_data - delegate_debug_metadatas """ - - # Fill out fields from profile event signature - if profile_event_signature is not None: - if profile_event_signature.delegate_id is not None: # 0 is a valid value - delegate_debug_identifier = profile_event_signature.delegate_id + # TODO: T201347372 Push the None check to earlier in the stack.
+ if event_signature is not None: + if event_signature.delegate_id is not None: # 0 is a valid value + delegate_debug_identifier = event_signature.delegate_id else: - delegate_debug_identifier = ( - profile_event_signature.delegate_id_str or None - ) + delegate_debug_identifier = event_signature.delegate_id_str or None # Use the delegate identifier as the event name if delegated is_delegated_op = delegate_debug_identifier is not None name = ( - profile_event_signature.name + event_signature.name if not is_delegated_op else str(delegate_debug_identifier) ) # Update fields - ret_event.name = name + # This is for older version of etdump that doesn't have the name field for debug events, we don't update the name field + if name: + ret_event.name = name ret_event.delegate_debug_identifier = delegate_debug_identifier ret_event.is_delegated_op = is_delegated_op + @staticmethod + def _populate_profiling_related_fields( + ret_event: "Event", + profile_event_signature: Optional[ProfileEventSignature], + events: List[InstructionEvent], + scale_factor: float, + ) -> None: + """ + Given a partially constructed Event, populate the fields related to + the profile events + + Fields Updated: + name + delegate_debug_identifier + is_delegated_op + perf_data + delegate_debug_metadatas + """ + + # Fill out fields from profile event signature + Event._populate_event_signature_fields(ret_event, profile_event_signature) + # Fill out fields from profile event data = [] delegate_debug_metadatas = [] @@ -571,9 +596,15 @@ def _populate_debugging_related_fields( the debug events Fields Updated: + name + delegate_debug_identifier + is_delegated_op debug_data """ + # Fill out fields from debug event signature + Event._populate_event_signature_fields(ret_event, debug_event_signature) + debug_data: List[flatcc.Value] = [] for event in events: if (debug_events := event.debug_events) is None: @@ -795,9 +826,7 @@ class GroupedRunInstances: # Construct the EventBlocks event_blocks = [] - scale_factor = ( - TIME_SCALE_DICT[source_time_scale] / TIME_SCALE_DICT[target_time_scale] - ) + scale_factor = calculate_time_scale_factor(source_time_scale, target_time_scale) for run_signature, grouped_run_instance in run_groups.items(): run_group: OrderedDict[EventSignature, List[InstructionEvent]] = ( grouped_run_instance.events @@ -962,6 +991,9 @@ def __init__( debug_buffer_path: Debug buffer file path that contains the debug data referenced by ETDump for intermediate and program outputs. delegate_metadata_parser: Optional function to parse delegate metadata from an Profiling Event. Expected signature of the function is: (delegate_metadata_list: List[bytes]) -> Union[List[str], Dict[str, Any]] + delegate_time_scale_converter: Optional function to convert the time scale of delegate profiling data. If not given, use the conversion ratio of + target_time_scale/source_time_scale. + enable_module_hierarchy: Enable submodules in the operator graph. Defaults to False. 
Returns: None @@ -976,6 +1008,14 @@ def __init__( self._source_time_scale = source_time_scale self._target_time_scale = target_time_scale + if delegate_time_scale_converter is None: + scale_factor = calculate_time_scale_factor( + source_time_scale, target_time_scale + ) + delegate_time_scale_converter = ( + lambda event_name, input_time: input_time / scale_factor + ) + if etrecord is None: self._etrecord = None elif isinstance(etrecord, ETRecord): @@ -998,10 +1038,10 @@ def __init__( ) self.event_blocks = EventBlock._gen_from_etdump( - etdump, - self._source_time_scale, - self._target_time_scale, - output_buffer, + etdump=etdump, + source_time_scale=self._source_time_scale, + target_time_scale=self._target_time_scale, + output_buffer=output_buffer, delegate_metadata_parser=delegate_metadata_parser, delegate_time_scale_converter=delegate_time_scale_converter, ) diff --git a/sdk/inspector/_inspector_utils.py b/devtools/inspector/_inspector_utils.py similarity index 95% rename from sdk/inspector/_inspector_utils.py rename to devtools/inspector/_inspector_utils.py index 6879e855057..5f04e2d0413 100644 --- a/sdk/inspector/_inspector_utils.py +++ b/devtools/inspector/_inspector_utils.py @@ -4,18 +4,20 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import math from enum import Enum from typing import Dict, List, Mapping, Optional, Tuple, TypeAlias, Union -import executorch.sdk.etdump.schema_flatcc as flatcc +import executorch.devtools.etdump.schema_flatcc as flatcc import torch -from executorch.sdk.debug_format.base_schema import OperatorNode +from executorch.devtools.debug_format.base_schema import OperatorNode -from executorch.sdk.debug_format.et_schema import FXOperatorGraph, OperatorGraph -from executorch.sdk.etdump.schema_flatcc import ( +from executorch.devtools.debug_format.et_schema import FXOperatorGraph, OperatorGraph +from executorch.devtools.etdump.schema_flatcc import ( DebugEvent, ETDumpFlatCC, ProfileEvent, @@ -25,8 +27,8 @@ ValueType, ) -from executorch.sdk.etdump.serialize import deserialize_from_etdump_flatcc -from executorch.sdk.etrecord import ETRecord +from executorch.devtools.etdump.serialize import deserialize_from_etdump_flatcc +from executorch.devtools.etrecord import ETRecord FORWARD = "forward" EDGE_DIALECT_GRAPH_KEY = "edge_dialect_graph_module" @@ -63,6 +65,15 @@ class TimeScale(Enum): } +def calculate_time_scale_factor( + source_time_scale: TimeScale, target_time_scale: TimeScale +) -> float: + """ + Calculate the factor (source divided by target) between two time scales + """ + return TIME_SCALE_DICT[source_time_scale] / TIME_SCALE_DICT[target_time_scale] + + # Model Debug Output InferenceOutput: TypeAlias = Union[ torch.Tensor, List[torch.Tensor], int, float, str, bool, None diff --git a/sdk/inspector/inspector_cli.py b/devtools/inspector/inspector_cli.py similarity index 93% rename from sdk/inspector/inspector_cli.py rename to devtools/inspector/inspector_cli.py index d6c8d5442f3..db3536a84bf 100644 --- a/sdk/inspector/inspector_cli.py +++ b/devtools/inspector/inspector_cli.py @@ -4,10 +4,12 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+# pyre-unsafe + import argparse -from executorch.sdk import Inspector -from executorch.sdk.inspector._inspector_utils import compare_results, TimeScale +from executorch.devtools import Inspector +from executorch.devtools.inspector import compare_results, TimeScale def main() -> None: diff --git a/devtools/inspector/tests/TARGETS b/devtools/inspector/tests/TARGETS new file mode 100644 index 00000000000..eada6817bcb --- /dev/null +++ b/devtools/inspector/tests/TARGETS @@ -0,0 +1,41 @@ +load("@fbcode_macros//build_defs:python_unittest.bzl", "python_unittest") + +oncall("executorch") + +python_unittest( + name = "inspector_test", + srcs = ["inspector_test.py"], + deps = [ + "//executorch/devtools:lib", + "//executorch/devtools/debug_format:et_schema", + "//executorch/devtools/etdump:schema_flatcc", + "//executorch/devtools/etrecord/tests:etrecord_test_library", + "//executorch/devtools/inspector:inspector", + "//executorch/devtools/inspector:lib", + "//executorch/exir:lib", + ], +) + +python_unittest( + name = "event_blocks_test", + srcs = ["event_blocks_test.py"], + deps = [ + "//executorch/devtools/etdump:schema_flatcc", + "//executorch/devtools/inspector:inspector", + "//executorch/devtools/inspector:lib", + ], +) + +python_unittest( + name = "inspector_utils_test", + srcs = ["inspector_utils_test.py"], + deps = [ + "//caffe2:torch", + "//executorch/devtools:lib", + "//executorch/devtools/debug_format:base_schema", + "//executorch/devtools/debug_format:et_schema", + "//executorch/devtools/etdump:schema_flatcc", + "//executorch/devtools/etrecord/tests:etrecord_test_library", + "//executorch/devtools/inspector:inspector_utils", + ], +) diff --git a/sdk/inspector/tests/event_blocks_test.py b/devtools/inspector/tests/event_blocks_test.py similarity index 89% rename from sdk/inspector/tests/event_blocks_test.py rename to devtools/inspector/tests/event_blocks_test.py index 7c7da001860..85b65aa5f34 100644 --- a/sdk/inspector/tests/event_blocks_test.py +++ b/devtools/inspector/tests/event_blocks_test.py @@ -8,10 +8,10 @@ import unittest from typing import List, Optional, Tuple, Union -import executorch.sdk.etdump.schema_flatcc as flatcc -from executorch.sdk.etdump.schema_flatcc import ETDumpFlatCC, ProfileEvent -from executorch.sdk.inspector import Event, EventBlock, PerfData -from executorch.sdk.inspector._inspector import ( +import executorch.devtools.etdump.schema_flatcc as flatcc +from executorch.devtools.etdump.schema_flatcc import ETDumpFlatCC, ProfileEvent +from executorch.devtools.inspector import Event, EventBlock, PerfData +from executorch.devtools.inspector._inspector import ( DelegateMetadata, EventSignature, InstructionEvent, @@ -62,6 +62,7 @@ def _gen_sample_profile_event( def _gen_sample_debug_event( instruction_id: int, delegate_debug_id: Optional[Union[int, str]] = None, + name: str = "test_debug_event", ) -> flatcc.DebugEvent: """ Helper for generating test DebugEvents @@ -77,6 +78,7 @@ def _gen_sample_debug_event( ) return flatcc.DebugEvent( + name=name, chain_index=0, instruction_id=instruction_id, delegate_debug_id_int=delegate_debug_id_int, @@ -299,6 +301,42 @@ def _get_sample_etdump_flatcc_profiling_and_debugging() -> flatcc.ETDumpFlatCC: return ETDumpFlatCC(version=0, run_data=[run_data_1, run_data_2, run_data_3]) + @staticmethod + def _get_sample_etdump_flatcc_debug_events_only( + event_name: str, + delegate_debug_id: str, + ) -> flatcc.ETDumpFlatCC: + """ + Helper for getting a sample ETDumpFlatCC object with RunData signature_a + and (debug_event_delegated, 
debug_event_non_delegated, no profile event) + """ + + debug_event_delegated = TestEventBlock._gen_sample_debug_event( + instruction_id=1, delegate_debug_id=delegate_debug_id, name=event_name + ) + debug_event_non_delegated = TestEventBlock._gen_sample_debug_event( + instruction_id=1, name=event_name + ) + run_data_1 = flatcc.RunData( + name="signature_a", + bundled_input_index=-1, + allocators=[], + events=[ + flatcc.Event( + allocation_event=None, + debug_event=debug_event_delegated, + profile_event=None, + ), + flatcc.Event( + allocation_event=None, + debug_event=debug_event_non_delegated, + profile_event=None, + ), + ], + ) + + return ETDumpFlatCC(version=0, run_data=[run_data_1]) + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Tests ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ def test_gen_from_etdump(self) -> None: @@ -370,6 +408,30 @@ def test_gen_from_etdump_inconsistent_debug_data(self) -> None: with self.assertRaises(AssertionError): EventBlock._gen_from_etdump(etdump) + def test_gen_from_etdump_debug_events_only(self) -> None: + """ + Test generation of EventBlocks given an ETDump with only debugging events + + Specifically it tests: + - Correct number of EventBlocks and Events + - Correct name of each Event + """ + event_name = "test_debug_event_only" + delegate_debug_id = "debug_id" + etdump: ETDumpFlatCC = ( + TestEventBlock._get_sample_etdump_flatcc_debug_events_only( + event_name=event_name, + delegate_debug_id=delegate_debug_id, + ) + ) + event_blocks = EventBlock._gen_from_etdump(etdump) + self.assertEqual(len(event_blocks), 1) + self.assertEqual(len(event_blocks[0].events), 2) + # Delegated event uses delegate_debug_id as event name + self.assertEqual(event_blocks[0].events[0].name, delegate_debug_id) + # Non delegated event uses event_name as event name + self.assertEqual(event_blocks[0].events[1].name, event_name) + def test_inspector_event_generation(self) -> None: """ Test Inspector.Event derivation from various ProfileEvent cases diff --git a/sdk/inspector/tests/inspector_test.py b/devtools/inspector/tests/inspector_test.py similarity index 89% rename from sdk/inspector/tests/inspector_test.py rename to devtools/inspector/tests/inspector_test.py index a372c7c569c..34c96eef534 100644 --- a/sdk/inspector/tests/inspector_test.py +++ b/devtools/inspector/tests/inspector_test.py @@ -4,31 +4,41 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+# pyre-unsafe + import random import statistics import tempfile import unittest from contextlib import redirect_stdout -from typing import List +from typing import Callable, List from unittest.mock import patch -from executorch.exir import ExportedProgram -from executorch.sdk import generate_etrecord, parse_etrecord -from executorch.sdk.debug_format.et_schema import OperatorNode -from executorch.sdk.etdump.schema_flatcc import ProfileEvent -from executorch.sdk.etrecord.tests.etrecord_test import TestETRecord - -from executorch.sdk.inspector import _inspector, Event, EventBlock, Inspector, PerfData -from executorch.sdk.inspector._inspector import ( +from executorch.devtools import generate_etrecord, parse_etrecord +from executorch.devtools.debug_format.et_schema import OperatorNode +from executorch.devtools.etdump.schema_flatcc import ProfileEvent +from executorch.devtools.etrecord.tests.etrecord_test import TestETRecord + +from executorch.devtools.inspector import ( + _inspector, + Event, + EventBlock, + Inspector, + PerfData, +) +from executorch.devtools.inspector._inspector import ( DebugEventSignature, flatcc, InstructionEvent, InstructionEventSignature, ProfileEventSignature, + TimeScale, ) +from executorch.exir import ExportedProgram + OP_TYPE = "aten::add" EVENT_BLOCK_NAME = "block_0" @@ -81,6 +91,33 @@ def test_inspector_constructor(self): # Because we mocked parse_etrecord() to return None, this method shouldn't be called mock_gen_graphs_from_etrecord.assert_not_called() + def test_default_delegate_time_scale_converter(self): + # Create a context manager to patch functions called by Inspector.__init__ + with patch.object( + _inspector, "parse_etrecord", return_value=None + ), patch.object( + _inspector, "gen_etdump_object", return_value=None + ), patch.object( + EventBlock, "_gen_from_etdump" + ) as mock_gen_from_etdump, patch.object( + _inspector, "gen_graphs_from_etrecord" + ), patch.object( + _inspector, "create_debug_handle_to_op_node_mapping" + ): + # Call the constructor of Inspector + Inspector( + etdump_path=ETDUMP_PATH, + etrecord=ETRECORD_PATH, + source_time_scale=TimeScale.US, + target_time_scale=TimeScale.S, + ) + + # Verify delegate_time_scale_converter is set to be a callable + self.assertIsInstance( + mock_gen_from_etdump.call_args.get("delegate_time_scale_converter"), + Callable, + ) + def test_inspector_print_data_tabular(self): # Create a context manager to patch functions called by Inspector.__init__ with patch.object( @@ -281,6 +318,7 @@ def test_populate_debugging_related_fields_raises_for_inconsistent_events(self): ) debug_event_0 = flatcc.DebugEvent( + name="event", chain_index=1, instruction_id=0, delegate_debug_id_int=1, @@ -304,6 +342,7 @@ def test_populate_debugging_related_fields_raises_for_inconsistent_events(self): # Note the sizes of this tensor are different from the previous one debug_event_1 = flatcc.DebugEvent( + name="event", chain_index=1, instruction_id=0, delegate_debug_id_int=1, @@ -348,6 +387,7 @@ def test_populate_debugging_related_fields_passes_for_consistent_events(self): ) debug_event_0 = flatcc.DebugEvent( + name="event", chain_index=1, instruction_id=0, delegate_debug_id_int=1, @@ -371,6 +411,7 @@ def test_populate_debugging_related_fields_passes_for_consistent_events(self): # Same as the event above except for offset debug_event_1 = flatcc.DebugEvent( + name="event", chain_index=1, instruction_id=0, delegate_debug_id_int=1, diff --git a/sdk/inspector/tests/inspector_utils_test.py b/devtools/inspector/tests/inspector_utils_test.py 
similarity index 88% rename from sdk/inspector/tests/inspector_utils_test.py rename to devtools/inspector/tests/inspector_utils_test.py index b5b9b54d6c4..73511f5fcd7 100644 --- a/sdk/inspector/tests/inspector_utils_test.py +++ b/devtools/inspector/tests/inspector_utils_test.py @@ -4,30 +4,34 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import tempfile import unittest from typing import Dict, Tuple import torch -from executorch.sdk import generate_etrecord, parse_etrecord +from executorch.devtools import generate_etrecord, parse_etrecord -from executorch.sdk.debug_format.base_schema import ( +from executorch.devtools.debug_format.base_schema import ( OperatorGraph, OperatorNode, ValueNode, ) -from executorch.sdk.debug_format.et_schema import FXOperatorGraph -from executorch.sdk.etdump import schema_flatcc as flatcc +from executorch.devtools.debug_format.et_schema import FXOperatorGraph +from executorch.devtools.etdump import schema_flatcc as flatcc -from executorch.sdk.etrecord.tests.etrecord_test import TestETRecord -from executorch.sdk.inspector._inspector_utils import ( +from executorch.devtools.etrecord.tests.etrecord_test import TestETRecord +from executorch.devtools.inspector._inspector_utils import ( + calculate_time_scale_factor, create_debug_handle_to_op_node_mapping, EDGE_DIALECT_GRAPH_KEY, find_populated_event, gen_graphs_from_etrecord, is_inference_output_equal, + TimeScale, ) @@ -74,6 +78,7 @@ def test_find_populated_event(self): end_time=2002, ) debug_event = flatcc.DebugEvent( + name="test_debug_event", chain_index=1, instruction_id=0, delegate_debug_id_str="56", @@ -170,6 +175,19 @@ def test_is_inference_output_equal_returns_true_for_same_strs(self): ) ) + def test_calculate_time_scale_factor_second_based(self): + self.assertEqual( + calculate_time_scale_factor(TimeScale.NS, TimeScale.MS), 1000000 + ) + self.assertEqual( + calculate_time_scale_factor(TimeScale.MS, TimeScale.NS), 1 / 1000000 + ) + + def test_calculate_time_scale_factor_cycles(self): + self.assertEqual( + calculate_time_scale_factor(TimeScale.CYCLES, TimeScale.CYCLES), 1 + ) + def gen_mock_operator_graph_with_expected_map() -> ( Tuple[OperatorGraph, Dict[int, OperatorNode]] diff --git a/sdk/size_analysis_tool/TARGETS b/devtools/size_analysis_tool/TARGETS similarity index 86% rename from sdk/size_analysis_tool/TARGETS rename to devtools/size_analysis_tool/TARGETS index 44ae0aa6f8b..c365ba152d5 100644 --- a/sdk/size_analysis_tool/TARGETS +++ b/devtools/size_analysis_tool/TARGETS @@ -12,9 +12,9 @@ python_library( visibility = ["PUBLIC"], deps = [ "//caffe2:torch", + "//executorch/devtools:lib", "//executorch/exir:lib", "//executorch/exir/backend:backend_api", - "//executorch/sdk:lib", ], ) @@ -23,13 +23,13 @@ python_binary( srcs = [ "size_analysis_tool.py", ], - main_function = "executorch.sdk.size_analysis_tool.size_analysis_tool.main", + main_function = "executorch.devtools.size_analysis_tool.size_analysis_tool.main", visibility = ["PUBLIC"], deps = [ "//caffe2:torch", + "//executorch/devtools:lib", "//executorch/exir:lib", "//executorch/exir/backend:backend_api", - "//executorch/sdk:lib", ], ) @@ -43,9 +43,9 @@ python_unittest( "//caffe2:torch", "//executorch/backends/xnnpack/partition:xnnpack_partitioner", "//executorch/backends/xnnpack/utils:xnnpack_utils", + "//executorch/devtools:lib", "//executorch/exir:lib", "//executorch/exir/backend:backend_api", "//executorch/exir/passes:spec_prop_pass", - 
"//executorch/sdk:lib", ], ) diff --git a/sdk/size_analysis_tool/size_analysis_tool.py b/devtools/size_analysis_tool/size_analysis_tool.py similarity index 99% rename from sdk/size_analysis_tool/size_analysis_tool.py rename to devtools/size_analysis_tool/size_analysis_tool.py index d17ec5ac477..8ea8ddbbf49 100644 --- a/sdk/size_analysis_tool/size_analysis_tool.py +++ b/devtools/size_analysis_tool/size_analysis_tool.py @@ -9,10 +9,10 @@ from typing import Any, Callable, Dict, List, Optional, Tuple import torch +from executorch.devtools import parse_etrecord from executorch.exir import ExportedProgram from executorch.exir.backend.backend_api import LoweredBackendModule -from executorch.sdk import parse_etrecord def _get_tensor_data(node: torch.fx.Node, tensor: torch.Tensor) -> Dict[str, Any]: diff --git a/sdk/size_analysis_tool/size_analysis_tool_test.py b/devtools/size_analysis_tool/size_analysis_tool_test.py similarity index 98% rename from sdk/size_analysis_tool/size_analysis_tool_test.py rename to devtools/size_analysis_tool/size_analysis_tool_test.py index 3e1efec77b5..96feae7e423 100644 --- a/sdk/size_analysis_tool/size_analysis_tool_test.py +++ b/devtools/size_analysis_tool/size_analysis_tool_test.py @@ -14,12 +14,12 @@ get_xnnpack_executorch_backend_config, ) from executorch.backends.xnnpack.utils.utils import capture_graph_for_xnnpack -from executorch.exir.backend.backend_api import to_backend, validation_disabled -from executorch.exir.passes.spec_prop_pass import SpecPropPass -from executorch.sdk.size_analysis_tool.size_analysis_tool import ( +from executorch.devtools.size_analysis_tool.size_analysis_tool import ( generate_model_size_information, ) +from executorch.exir.backend.backend_api import to_backend, validation_disabled +from executorch.exir.passes.spec_prop_pass import SpecPropPass class SizeAnalysisToolTest(unittest.TestCase): diff --git a/sdk/targets.bzl b/devtools/targets.bzl similarity index 76% rename from sdk/targets.bzl rename to devtools/targets.bzl index 38c2e6e820e..17d9e89cad3 100644 --- a/sdk/targets.bzl +++ b/devtools/targets.bzl @@ -4,5 +4,5 @@ def build_sdk(): def get_sdk_flags(): sdk_flags = [] if build_sdk(): - sdk_flags += ["-DEXECUTORCH_BUILD_SDK"] + sdk_flags += ["-DEXECUTORCH_BUILD_DEVTOOLS"] return sdk_flags diff --git a/docs/source/Doxyfile b/docs/source/Doxyfile index b741509197d..e662105b83f 100644 --- a/docs/source/Doxyfile +++ b/docs/source/Doxyfile @@ -964,8 +964,7 @@ INPUT = ../runtime/executor/memory_manager.h \ ../runtime/core/tensor_shape_dynamism.h \ ../runtime/platform/compiler.h \ ../runtime/executor/ \ - ../runtime/platform/ \ - ../util/ + ../runtime/platform/ diff --git a/docs/source/_static/img/benchmark-infra.png b/docs/source/_static/img/benchmark-infra.png new file mode 100644 index 00000000000..a5d30774257 Binary files /dev/null and b/docs/source/_static/img/benchmark-infra.png differ diff --git a/docs/source/_static/img/chat.png b/docs/source/_static/img/chat.png new file mode 100644 index 00000000000..e7ed934519d Binary files /dev/null and b/docs/source/_static/img/chat.png differ diff --git a/docs/source/_static/img/chat_response.png b/docs/source/_static/img/chat_response.png new file mode 100644 index 00000000000..714265276fe Binary files /dev/null and b/docs/source/_static/img/chat_response.png differ diff --git a/docs/source/_static/img/ios_demo_app.jpg b/docs/source/_static/img/ios_demo_app.jpg new file mode 100644 index 00000000000..076508d0e0d Binary files /dev/null and b/docs/source/_static/img/ios_demo_app.jpg differ 
diff --git a/docs/source/_static/img/ios_demo_app_choosing_package.png b/docs/source/_static/img/ios_demo_app_choosing_package.png new file mode 100644 index 00000000000..20599d7ea80 Binary files /dev/null and b/docs/source/_static/img/ios_demo_app_choosing_package.png differ diff --git a/docs/source/_static/img/ios_demo_app_llava.jpg b/docs/source/_static/img/ios_demo_app_llava.jpg new file mode 100644 index 00000000000..316d68b71bd Binary files /dev/null and b/docs/source/_static/img/ios_demo_app_llava.jpg differ diff --git a/docs/source/_static/img/ios_demo_app_mps.jpg b/docs/source/_static/img/ios_demo_app_mps.jpg new file mode 100644 index 00000000000..58114f869c6 Binary files /dev/null and b/docs/source/_static/img/ios_demo_app_mps.jpg differ diff --git a/docs/source/_static/img/ios_demo_app_swift_pm.png b/docs/source/_static/img/ios_demo_app_swift_pm.png new file mode 100644 index 00000000000..19e7a6726e1 Binary files /dev/null and b/docs/source/_static/img/ios_demo_app_swift_pm.png differ diff --git a/docs/source/_static/img/llava_example.png b/docs/source/_static/img/llava_example.png new file mode 100644 index 00000000000..ccac335ee65 Binary files /dev/null and b/docs/source/_static/img/llava_example.png differ diff --git a/docs/source/_static/img/load_complete_and_start_prompt.png b/docs/source/_static/img/load_complete_and_start_prompt.png new file mode 100644 index 00000000000..43d81f10d00 Binary files /dev/null and b/docs/source/_static/img/load_complete_and_start_prompt.png differ diff --git a/docs/source/_static/img/logs.png b/docs/source/_static/img/logs.png new file mode 100644 index 00000000000..e35227a1c0c Binary files /dev/null and b/docs/source/_static/img/logs.png differ diff --git a/docs/source/_static/img/mtk_changes_to_shell_file.png b/docs/source/_static/img/mtk_changes_to_shell_file.png new file mode 100644 index 00000000000..7fa4e461863 Binary files /dev/null and b/docs/source/_static/img/mtk_changes_to_shell_file.png differ diff --git a/docs/source/_static/img/mtk_output.png b/docs/source/_static/img/mtk_output.png new file mode 100644 index 00000000000..e41d54c3561 Binary files /dev/null and b/docs/source/_static/img/mtk_output.png differ diff --git a/docs/source/_static/img/opening_the_app_details.png b/docs/source/_static/img/opening_the_app_details.png new file mode 100644 index 00000000000..60494ecc69d Binary files /dev/null and b/docs/source/_static/img/opening_the_app_details.png differ diff --git a/docs/source/_static/img/settings_menu.png b/docs/source/_static/img/settings_menu.png new file mode 100644 index 00000000000..028e6b55cd7 Binary files /dev/null and b/docs/source/_static/img/settings_menu.png differ diff --git a/docs/source/apple-runtime.md b/docs/source/apple-runtime.md index 2378ddc2bd2..023903db3b2 100644 --- a/docs/source/apple-runtime.md +++ b/docs/source/apple-runtime.md @@ -19,6 +19,19 @@ Link your binary with the ExecuTorch runtime and any backends or kernels used by ## Integration +### Setup + +#### CMake + +Building the Xcode project requires CMake. Installing via homebrew does not +typically work; instead, install the packaged application and commandline tools +globally: + +1. Download the macOS `.dmg` installer from https://cmake.org/download +2. Open the `.dmg` +3. Drag the CMake app to the `/Applications` folder +4. 
In a terminal, install the command line tools: `sudo /Applications/CMake.app/Contents/bin/cmake-gui --install` + ### Swift Package Manager The prebuilt ExecuTorch runtime, backend, and kernels are available as a [Swift PM](https://www.swift.org/documentation/package-manager/) package. diff --git a/docs/source/build-run-qualcomm-ai-engine-direct-backend.md b/docs/source/build-run-qualcomm-ai-engine-direct-backend.md index c774ae57b43..94a936b2e7a 100644 --- a/docs/source/build-run-qualcomm-ai-engine-direct-backend.md +++ b/docs/source/build-run-qualcomm-ai-engine-direct-backend.md @@ -59,9 +59,7 @@ This example is verified with SM8550 and SM8450. - Click the "Get Software" button to download a version of QNN SDK. - However, at the moment of updating this tutorial, the above website doesn't provide QNN SDK newer than 2.22.6. - The below is public links to download various QNN versions. Hope they can be publicly discoverable soon. - - [QNN 2.25.0](https://softwarecenter.qualcomm.com/api/download/software/qualcomm_neural_processing_sdk/v2.25.0.240728.zip) - - [QNN 2.24.0](https://softwarecenter.qualcomm.com/api/download/software/qualcomm_neural_processing_sdk/v2.24.0.240626.zip) - - [QNN 2.23.0](https://softwarecenter.qualcomm.com/api/download/software/qualcomm_neural_processing_sdk/v2.23.0.24.06.24.zip) + - [QNN 2.26.0](https://softwarecenter.qualcomm.com/api/download/software/qualcomm_neural_processing_sdk/v2.26.0.240828.zip) The directory with installed Qualcomm AI Engine Direct SDK looks like: ``` @@ -126,16 +124,17 @@ Python APIs on x64 are required to compile models to Qualcomm AI Engine Direct b ```bash cd $EXECUTORCH_ROOT -mkdir cmake-out -cd cmake-out +mkdir build-x86 +cd build-x86 # Note that the below command might change. # Please refer to the above build.sh for latest workable commands. cmake .. \ -DCMAKE_INSTALL_PREFIX=$PWD \ -DEXECUTORCH_BUILD_QNN=ON \ -DQNN_SDK_ROOT=${QNN_SDK_ROOT} \ - -DEXECUTORCH_BUILD_SDK=ON \ + -DEXECUTORCH_BUILD_DEVTOOLS=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ -DPYTHON_EXECUTABLE=python3 \ -DEXECUTORCH_SEPARATE_FLATCC_HOST_PROJECT=OFF @@ -158,15 +157,16 @@ Commands to build `qnn_executor_runner` for Android: ```bash cd $EXECUTORCH_ROOT -mkdir cmake-out-android -cd cmake-out-android +mkdir build-android +cd build-android # build executorch & qnn_executorch_backend cmake .. 
\ -DCMAKE_INSTALL_PREFIX=$PWD \ -DEXECUTORCH_BUILD_QNN=ON \ -DQNN_SDK_ROOT=$QNN_SDK_ROOT \ - -DEXECUTORCH_BUILD_SDK=ON \ + -DEXECUTORCH_BUILD_DEVTOOLS=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ -DPYTHON_EXECUTABLE=python3 \ -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_ROOT/build/cmake/android.toolchain.cmake \ @@ -189,7 +189,7 @@ cmake ../examples/qualcomm \ cmake --build examples/qualcomm -j$(nproc) # qnn_executor_runner can be found under examples/qualcomm -# The full path is $EXECUTORCH_ROOT/cmake-out-android/examples/qualcomm/qnn_executor_runner +# The full path is $EXECUTORCH_ROOT/build-android/examples/qualcomm/qnn_executor_runner ls examples/qualcomm ``` @@ -209,7 +209,7 @@ cd $EXECUTORCH_ROOT cp schema/program.fbs exir/_serialize/program.fbs cp schema/scalar_type.fbs exir/_serialize/scalar_type.fbs -python -m examples.qualcomm.scripts.deeplab_v3 -b cmake-out-android -m SM8550 --compile_only --download +python -m examples.qualcomm.scripts.deeplab_v3 -b build-android -m SM8550 --compile_only --download ``` You might see something like below: @@ -239,7 +239,7 @@ We can test model inferences before deploying it to a device by HTP emulator. Let's build `qnn_executor_runner` for a x64 host: ```bash # assuming the AOT component is built. -cd $EXECUTORCH_ROOT/cmake-out +cd $EXECUTORCH_ROOT/build-x86 cmake ../examples/qualcomm \ -DCMAKE_PREFIX_PATH="$PWD/lib/cmake/ExecuTorch;$PWD/third-party/gflags;" \ -DCMAKE_FIND_ROOT_PATH_MODE_PACKAGE=BOTH \ @@ -249,14 +249,14 @@ cmake ../examples/qualcomm \ cmake --build examples/qualcomm -j$(nproc) # qnn_executor_runner can be found under examples/qualcomm -# The full path is $EXECUTORCH_ROOT/cmake-out/examples/qualcomm/qnn_executor_runner +# The full path is $EXECUTORCH_ROOT/build-x86/examples/qualcomm/qnn_executor_runner ls examples/qualcomm/ ``` To run the HTP emulator, the dynamic linker need to access QNN libraries and `libqnn_executorch_backend.so`. We set the below two paths to `LD_LIBRARY_PATH` environment variable: 1. `$QNN_SDK_ROOT/lib/x86_64-linux-clang/` - 2. `$EXECUTORCH_ROOT/cmake-out/lib/` + 2. `$EXECUTORCH_ROOT/build-x86/lib/` The first path is for QNN libraries including HTP emulator. It has been configured in the AOT compilation section. @@ -264,8 +264,8 @@ The second path is for `libqnn_executorch_backend.so`. So, we can run `./deeplab_v3/dlv3_qnn.pte` by: ```bash -cd $EXECUTORCH_ROOT/cmake-out -export LD_LIBRARY_PATH=$EXECUTORCH_ROOT/cmake-out/lib/:$LD_LIBRARY_PATH +cd $EXECUTORCH_ROOT/build-x86 +export LD_LIBRARY_PATH=$EXECUTORCH_ROOT/build-x86/lib/:$LD_LIBRARY_PATH examples/qualcomm/qnn_executor_runner --model_path ../deeplab_v3/dlv3_qnn.pte ``` @@ -308,8 +308,8 @@ So, we can run `qnn_executor_runner` like ```bash adb push ./deeplab_v3/dlv3_qnn.pte ${DEVICE_DIR} -adb push ${EXECUTORCH_ROOT}/cmake-out-android/examples/qualcomm/executor_runner/qnn_executor_runner ${DEVICE_DIR} -adb push ${EXECUTORCH_ROOT}/cmake-out-android/lib/libqnn_executorch_backend.so ${DEVICE_DIR} +adb push ${EXECUTORCH_ROOT}/build-android/examples/qualcomm/executor_runner/qnn_executor_runner ${DEVICE_DIR} +adb push ${EXECUTORCH_ROOT}/build-android/lib/libqnn_executorch_backend.so ${DEVICE_DIR} adb shell "cd ${DEVICE_DIR} \ && export LD_LIBRARY_PATH=${DEVICE_DIR} \ && export ADSP_LIBRARY_PATH=${DEVICE_DIR} \ @@ -333,7 +333,7 @@ I 00:00:00.364875 executorch:qnn_executor_runner.cpp:425] Write etdump to etdump The model is merely executed. 
If we want to feed real inputs and get model outputs, we can use ```bash cd $EXECUTORCH_ROOT -python -m examples.qualcomm.scripts.deeplab_v3 -b cmake-out-android -m SM8550 --download -s +python -m examples.qualcomm.scripts.deeplab_v3 -b build-android -m SM8550 --download -s ``` The `` can be found by `adb devices` command. @@ -354,7 +354,7 @@ Please refer to `$EXECUTORCH_ROOT/examples/qualcomm/scripts/` and `EXECUTORCH_RO ## What is coming? - - [llama2 and llama3](https://github.com/pytorch/executorch/pull/4030). Note that at the moment of writing, we still suffer from the quantization issue in llama2-7B and llama3-8B cases. Only storiesllama works well. + - Improve the performance for llama3-8B-Instruct and support batch prefill. - We will support pre-compiled binaries from [Qualcomm AI Hub](https://aihub.qualcomm.com/). ## FAQ diff --git a/docs/source/compiler-delegate-and-partitioner.md b/docs/source/compiler-delegate-and-partitioner.md index fa41ec93c9d..c82af7d98fe 100644 --- a/docs/source/compiler-delegate-and-partitioner.md +++ b/docs/source/compiler-delegate-and-partitioner.md @@ -127,13 +127,13 @@ static auto success_with_compiler = register_backend(backend); ``` -## SDK Integration: Debuggability +## Developer Tools Integration: Debuggability -Providing consistent debugging experience, be it for runtime failures or performance profiling, is important. ExecuTorch employs native SDK (Software Development Kit) for this purpose, which enables correlating program instructions to original PyTorch code, via debug handles. You can read more about it [here](./sdk-etrecord). +Providing a consistent debugging experience, be it for runtime failures or performance profiling, is important. ExecuTorch employs its native Developer Tools for this purpose, which enable correlating program instructions to original PyTorch code via debug handles. You can read more about it [here](./sdk-etrecord). -Delegated program or subgraphs are opaque to ExecuTorch runtime and appear as a special `call_delegate` instruction, which asks corresponding backend to handle the execution of the subgraph or program. Due to the opaque nature of backend delgates, native SDK does not have visibility into delegated program. Thus the debugging, functional or performance, experiences of delegated execution suffers significantly as compared to it's non-delegated counterpart. +Delegated programs or subgraphs are opaque to the ExecuTorch runtime and appear as a special `call_delegate` instruction, which asks the corresponding backend to handle the execution of the subgraph or program. Due to the opaque nature of backend delegates, the native Developer Tools do not have visibility into the delegated program. Thus the debugging experience, be it functional or performance, of delegated execution suffers significantly compared to its non-delegated counterpart. -In order to provide consistent debugging experience to users, regardless of the use of delegation for a model, SDK provides an interface to correlate delegated (sub)graph to original (sub)graph. The SDK does so via debug handles map which allows delegates to generate internal handles that can be associated with the original (sub)graph consumed by the delegate. Then at runtime, backend developer can report error or profiling information using the internal handle, which will be mapped to original (sub)graph using the debug handle map. For more information, please refer to [SDK delegate integration](./sdk-delegate-integration).
+In order to provide consistent debugging experience to users, regardless of the use of delegation for a model, Developer Tools provide an interface to correlate delegated (sub)graph to original (sub)graph. The Developer Tools do so via debug handles map which allows delegates to generate internal handles that can be associated with the original (sub)graph consumed by the delegate. Then at runtime, backend developer can report error or profiling information using the internal handle, which will be mapped to original (sub)graph using the debug handle map. For more information, please refer to [Developer Tools Delegate Integration](./sdk-delegate-integration). By leveraging the debug identifier, backend developer can embed the debug as part of the delegated blob diff --git a/docs/source/compiler-memory-planning.md b/docs/source/compiler-memory-planning.md index 1dad3b032fc..fcad2eca58b 100644 --- a/docs/source/compiler-memory-planning.md +++ b/docs/source/compiler-memory-planning.md @@ -32,7 +32,6 @@ The `MemoryPlanningPass` exposes the option to not memory plan program inputs an program = edge_program.to_executorch( exir.ExecutorchBackendConfig( memory_planning_pass=MemoryPlanningPass( - memory_planning_algo="greedy", alloc_graph_input=False, # Inputs will not be memory planned, the data_ptr for input tensors after model load will be nullptr alloc_graph_output=True, # Outputs will be memory planned, the data_ptr for input tensors after model load will be in the `planned_memory`. ) @@ -77,7 +76,7 @@ Then later when lowering to ExecuTorch you can use your custom plan in the follo program = edge_program.to_executorch( exir.ExecutorchBackendConfig( memory_planning_pass=CustomPoolMemoryPlanningPass( - memory_planning_algo="greedy", + memory_planning_algo=greedy, ) ) ) diff --git a/docs/source/concepts.md b/docs/source/concepts.md index 33d944c376a..c085505b61a 100644 --- a/docs/source/concepts.md +++ b/docs/source/concepts.md @@ -283,9 +283,9 @@ Techniques for performing computations and memory accesses on tensors with lower The ExecuTorch runtime executes models on edge devices. It is responsible for program initialization, program execution and, optionally, destruction (releasing backend owned resources). -## [SDK](./sdk-overview.md) +## [Developer Tools](./devtools-overview.md) -Software Development Kit. The tooling users need to profile, debug and visualize programs that are running with ExecuTorch. +A collection of tools users need to profile, debug and visualize programs that are running with ExecuTorch. ## [Selective build](./kernel-library-selective-build.md) diff --git a/docs/source/devtools-overview.md b/docs/source/devtools-overview.md new file mode 100644 index 00000000000..13fd8e00597 --- /dev/null +++ b/docs/source/devtools-overview.md @@ -0,0 +1,44 @@ +# Introduction to the ExecuTorch Developer Tools + +ExecuTorch has been designed with [productivity](./intro-overview.md) as one of its core objectives and the ExecuTorch Developer Tools enable this through the comprehensive suite of tools it provides users to help them profile, debug, and visualize models that they have onboarded onto ExecuTorch. + +All the components of the Developer Tools have been designed from the ground up with deep integration in both the export process and the runtime. This enables us to provide unique features such as linking back operator execution in the runtime to the line of code in the original eager model that this operator originated from. 
+ +## Developer Tools Features + +The ExecuTorch Developer Tools support the following features: + +- **BundledProgram** is a utility tool for exporting the model bundled with a sample set of (representative) inputs and expected outputs, so that during runtime users can validate that the actual output is in fact the same as the expected output. +- **Profiling** models with operator level breakdown of performance stats + - Linking back operator performance stats to source code and module hierarchy + - Model loading and execution time +- **Delegate Integration** - Surfacing performance details from delegate backends + - Link back delegate operator execution to the nodes they represent in the edge dialect graph (and subsequently linking back to source code and module hierarchy) +- **Debugging** - Intermediate outputs and output quality analysis +- **Visualization** - Coming soon + +## Fundamental components of the Developer Tools + +This section details the fundamental components that power the Developer Tools, so that you can fully understand and leverage them. + +### ETRecord +ETRecord (ExecuTorch Record) is an artifact generated during the export process that stores the graphs and other metadata that is critical for the Developer Tools to be able to link back the performance/debug data sourced from the runtime to the source code of the eager model. + +To draw a rough equivalence to conventional software development, ETRecord can be considered as the binary built with debug symbols that is used for debugging in the GNU Project debugger (gdb). + +More details are available in the [ETRecord documentation](sdk-etrecord.rst) on how to generate and store an ETRecord. + +### ETDump +ETDump (ExecuTorch Dump) is the binary blob that is generated by the runtime after running a model. Similarly, to draw a rough equivalence to conventional software development, ETDump can be considered as the coredump of ExecuTorch, but in this case within ETDump we store all the performance and debug data that was generated by the runtime during model execution. + +```{note} +If you only care about looking at the raw performance data without linking back to source code and other extensive features, an ETDump alone will be enough to leverage the basic features of the Developer Tools. For the full experience, it is recommended that users also generate an ETRecord. +``` + +More details are available in the [ETDump documentation](sdk-etdump.md) on how to generate and store an ETDump from the runtime. + + +### Inspector APIs +The Inspector Python APIs are the main user entry point into the Developer Tools. They join the data sourced from ETDump and ETRecord to give users access to all the performance and debug data sourced from the runtime, along with linkage back to eager model source code and module hierarchy, in an easy-to-use API. + +More details are available in the [Inspector API documentation](sdk-inspector.rst) on how to use the Inspector APIs. diff --git a/docs/source/devtools-tutorial.md b/docs/source/devtools-tutorial.md new file mode 100644 index 00000000000..33d78cf58da --- /dev/null +++ b/docs/source/devtools-tutorial.md @@ -0,0 +1,3 @@ +## Developer Tools Usage Tutorial + +Please refer to the [Developer Tools tutorial](./tutorials/devtools-integration-tutorial) for a walkthrough on how to profile a model in ExecuTorch using the Developer Tools.
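To make the relationship between these components concrete, below is a minimal sketch of how ETRecord, ETDump, and the Inspector might fit together for a toy model. The file names, the tiny `Add` module, and the final `print_data_tabular()` call are illustrative assumptions rather than a prescribed recipe; the linked ETRecord, ETDump, and Inspector pages (and the tutorial above) remain the authoritative references.

```python
import copy

import torch
from executorch.devtools import Inspector, generate_etrecord
from executorch.exir import to_edge
from torch.export import export


class Add(torch.nn.Module):
    def forward(self, x, y):
        return x + y


# Ahead of time: export, lower, and generate an ETRecord next to the program.
exported = export(Add(), (torch.ones(1), torch.ones(1)))
edge = to_edge(exported)
edge_copy = copy.deepcopy(edge)  # copy before to_executorch() mutates the edge program
et_program = edge.to_executorch()
generate_etrecord("etrecord.bin", edge_copy, et_program)

# At runtime, executing the .pte with ETDump enabled produces e.g. "etdump.etdp".
# The Inspector joins the two artifacts to expose per-operator performance data
# with linkage back to the eager source.
inspector = Inspector(etdump_path="etdump.etdp", etrecord="etrecord.bin")
inspector.print_data_tabular()
```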
diff --git a/docs/source/extension-module.md b/docs/source/extension-module.md index 9e236e8e489..7516184d1cc 100644 --- a/docs/source/extension-module.md +++ b/docs/source/extension-module.md @@ -22,7 +22,7 @@ Tensor::SizesType sizes[] = {1, 3, 256, 256}; TensorImpl tensor(ScalarType::Float, std::size(sizes), sizes, input); // Perform an inference. -const auto result = module.forward({EValue(Tensor(&tensor))}); +const auto result = module.forward(Tensor(&tensor)); // Check for success or failure. if (result.ok()) { @@ -105,13 +105,13 @@ Note: `method_meta()` will try to force-load the `Method` when called for the fi Assuming that the `Program`'s method names and their input format is known ahead of time, we rarely need to query for those and can run the methods directly by name using the `execute()` function: ```cpp -const auto result = module.execute("forward", {EValue(Tensor(&tensor))}); +const auto result = module.execute("forward", Tensor(&tensor)); ``` Which can also be simplified for the standard `forward()` method name as: ```cpp -const auto result = module.forward({EValue(Tensor(&tensor))}); +const auto result = module.forward(Tensor(&tensor)); ``` Note: `execute()` or `forward()` will try to force load the `Program` and the `Method` when called for the first time. Therefore, the first inference will take more time than subsequent ones as it loads the model lazily and prepares it for execution unless the `Program` or `Method` was loaded explicitly earlier using the corresponding functions. @@ -132,7 +132,7 @@ Use [ExecuTorch Dump](sdk-etdump.md) to trace model execution. Create an instanc #include #include #include -#include +#include using namespace ::torch::executor; diff --git a/docs/source/getting-started-architecture.md b/docs/source/getting-started-architecture.md index 2c3f85aff17..937b5b389f5 100644 --- a/docs/source/getting-started-architecture.md +++ b/docs/source/getting-started-architecture.md @@ -87,8 +87,8 @@ The ExecuTorch runtime is written in C++ with minimal dependencies for portabili _Executor_ is the entry point to load the program and execute it. The execution triggers corresponding operator kernels or backend execution from this very minimal runtime. -## SDK +## Developer Tools -It should be efficient for users to go from research to production using the flow above. Productivity is essentially important, for users to author, optimize and deploy their models. We provide [ExecuTorch SDK](./sdk-overview.md) to improve productivity. The SDK is not in the diagram. Instead it's a tool set that covers the developer workflow in all three phases. +It should be efficient for users to go from research to production using the flow above. Productivity is essentially important, for users to author, optimize and deploy their models. We provide [ExecuTorch Developer Tools](./devtools-overview.md) to improve productivity. The Developer Tools are not in the diagram. Instead it's a tool set that covers the developer workflow in all three phases. -During the program preparation and execution, users can use the ExecuTorch SDK to profile, debug, or visualize the program. Since the end-to-end flow is within the PyTorch ecosystem, users can correlate and display performance data along with graph visualization as well as direct references to the program source code and model hierarchy. We consider this to be a critical component for quickly iterating and lowering PyTorch programs to edge devices and environments. 
+During the program preparation and execution, users can use the ExecuTorch Developer Tools to profile, debug, or visualize the program. Since the end-to-end flow is within the PyTorch ecosystem, users can correlate and display performance data along with graph visualization as well as direct references to the program source code and model hierarchy. We consider this to be a critical component for quickly iterating and lowering PyTorch programs to edge devices and environments. diff --git a/docs/source/getting-started-setup.md b/docs/source/getting-started-setup.md index d610f020ef2..15fa084e33f 100644 --- a/docs/source/getting-started-setup.md +++ b/docs/source/getting-started-setup.md @@ -59,13 +59,11 @@ also work in similar environments. - We recommend `conda` as it provides cross-language support and integrates smoothly with `pip` (Python's built-in package manager) - Otherwise, Python's built-in virtual environment manager `python venv` is a good alternative. -* `g++` version 8 or higher, `clang++` version 8 or higher, or another - C++17-compatible toolchain that supports GNU C-style [statement - expressions](https://gcc.gnu.org/onlinedocs/gcc/Statement-Exprs.html) (`({ ... - })` syntax). +* `g++` version 7 or higher, `clang++` version 5 or higher, or another + C++17-compatible toolchain. Note that the cross-compilable core runtime code supports a wider range of -toolchains, down to C++11. See the [Runtime Overview](./runtime-overview.md) for +toolchains, down to C++17. See the [Runtime Overview](./runtime-overview.md) for portability details. ## Quick Setup: Colab/Jupyter Notebook Prototype @@ -112,6 +110,23 @@ Alternatively, if you would like to experiment with ExecuTorch quickly and easil ``` After setting up your environment, you are ready to convert your PyTorch programs to ExecuTorch. + +> **_NOTE:_** Cleaning the build system +> +> When fetching a new version of the upstream repo (via `git fetch` or `git +> pull`) it is a good idea to clean the old build artifacts. The build system +> does not currently adapt well to changes in build dependencies. +> +> You should also update and pull the submodules again, in case their versions +> have changed. +> +> ```bash +> # From the root of the executorch repo: +> rm -rf cmake-out pip-out +> git submodule sync +> git submodule update --init +> ``` + ## Create an ExecuTorch program After setting up your environment, you are ready to convert your PyTorch programs @@ -171,13 +186,30 @@ For now, let's use [`executor_runner`](https://github.com/pytorch/executorch/blo ### Build Tooling Setup The ExecuTorch repo uses CMake to build its C++ code. Here, we'll configure it to build the `executor_runner` tool to run it on our desktop OS. ```bash - # Clean and configure the CMake build system. Compiled programs will appear in the executorch/cmake-out directory we create here. + # Clean and configure the CMake build system. Compiled programs will + # appear in the executorch/cmake-out directory we create here. (rm -rf cmake-out && mkdir cmake-out && cd cmake-out && cmake ..) # Build the executor_runner target cmake --build cmake-out --target executor_runner -j9 ``` +> **_NOTE:_** Cleaning the build system +> +> When fetching a new version of the upstream repo (via `git fetch` or `git +> pull`) it is a good idea to clean the old build artifacts. The build system +> does not currently adapt well to changes in build dependencies. +> +> You should also update and pull the submodules again, in case their versions +> have changed. 
+> +> ```bash +> # From the root of the executorch repo: +> rm -rf cmake-out pip-out +> git submodule sync +> git submodule update --init +> ``` + ### Run Your Program Now that we've exported a program and built the runtime, let's execute it! diff --git a/docs/source/index.rst b/docs/source/index.rst index d8955c513e4..d49fd43e31b 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -94,7 +94,7 @@ Topics in this section will help you get started with ExecuTorch. tutorials/export-to-executorch-tutorial running-a-model-cpp-tutorial extension-module - tutorials/sdk-integration-tutorial + tutorials/devtools-integration-tutorial apple-runtime demo-apps-ios demo-apps-android @@ -117,6 +117,9 @@ Topics in this section will help you get started with ExecuTorch. :hidden: llm/getting-started + llm/llama-demo-android + llm/build-run-llama3-qualcomm-ai-engine-direct-backend + llm/llama-demo-ios .. toctree:: :glob: @@ -193,10 +196,10 @@ Topics in this section will help you get started with ExecuTorch. .. toctree:: :glob: :maxdepth: 1 - :caption: SDK + :caption: Developer Tools :hidden: - sdk-overview + devtools-overview sdk-bundled-io sdk-etrecord sdk-etdump @@ -204,7 +207,7 @@ Topics in this section will help you get started with ExecuTorch. sdk-debugging sdk-inspector sdk-delegate-integration - sdk-tutorial + devtools-tutorial .. toctree:: :glob: @@ -244,11 +247,11 @@ ExecuTorch tutorials. :tags: .. customcarditem:: - :header: Using the ExecuTorch SDK to Profile a Model - :card_description: A tutorial for using the ExecuTorch SDK to profile and analyze a model with linkage back to source code. + :header: Using the ExecuTorch Developer Tools to Profile a Model + :card_description: A tutorial for using the ExecuTorch Developer Tools to profile and analyze a model with linkage back to source code. :image: _static/img/generic-pytorch-logo.png - :link: tutorials/sdk-integration-tutorial.html - :tags: SDK + :link: tutorials/devtools-integration-tutorial.html + :tags: devtools .. customcarditem:: :header: Integrating and Running ExecuTorch on Apple Platforms diff --git a/docs/source/intro-overview.md b/docs/source/intro-overview.md index f80caff4679..96c7982b8fe 100644 --- a/docs/source/intro-overview.md +++ b/docs/source/intro-overview.md @@ -10,9 +10,9 @@ Key value propositions of ExecuTorch are: - **Portability:** Compatibility with a wide variety of computing platforms, from high-end mobile phones to highly constrained embedded systems and microcontrollers. -- **Productivity:** Enabling developers to use the same toolchains and SDK from - PyTorch model authoring and conversion, to debugging and deployment to a wide - variety of platforms. +- **Productivity:** Enabling developers to use the same toolchains and Developer + Tools from PyTorch model authoring and conversion, to debugging and deployment + to a wide variety of platforms. - **Performance:** Providing end users with a seamless and high-performance experience due to a lightweight runtime and utilizing full hardware capabilities such as CPUs, NPUs, and DSPs. diff --git a/docs/source/kernel-library-custom-aten-kernel.md b/docs/source/kernel-library-custom-aten-kernel.md index 8fb4ed96cd5..0f060d1c5e5 100644 --- a/docs/source/kernel-library-custom-aten-kernel.md +++ b/docs/source/kernel-library-custom-aten-kernel.md @@ -3,23 +3,49 @@ At the last stage of [ExecuTorch model exporting](./export-overview.md), we lower the operators in the dialect to the _out variants_ of the [core ATen operators](./ir-ops-set-definition.md). 
Then we serialize these operator names into the model artifact. During runtime execution, for each operator name we will need to find the actual _kernels_, i.e., the C++ functions that do the heavy-lifting calculations and return results. -Portable kernel library is the in-house default kernel library, it’s easy to use and portable for most of the target backends. However it’s not optimized for performance, because it’s not specialized for any certain target. Therefore we provide kernel registration APIs for ExecuTorch users to easily register their own optimized kernels. +## Kernel Libraries +### First-party kernel libraries: +**[Portable kernel library](https://github.com/pytorch/executorch/tree/main/kernels/portable)** is the in-house default kernel library that covers most of the core ATen operators. It’s easy to use/read and is written in portable C++17. However it’s not optimized for performance, because it’s not specialized for any certain target. Therefore we provide kernel registration APIs for ExecuTorch users to easily register their own optimized kernels. -## Design Principles +**[Optimized kernel library](https://github.com/pytorch/executorch/tree/main/kernels/optimized)** specializes on performance for some of the operators, leveraging existing third party libraries such as [EigenBLAS](https://gitlab.com/libeigen/eigen). This works best along with the portable kernel library, with a good balance on portability and performance. One example of combining these two libraries can be found [here](https://github.com/pytorch/executorch/blob/main/configurations/CMakeLists.txt). -**What do we support?** On the operator coverage side, the kernel registration APIs allow users to register kernels for all core ATen ops as well as custom ops, as long as the custom ops schemas are specified. +**[Quantized kernel library](https://github.com/pytorch/executorch/tree/main/kernels/quantized)** implements operators for quantization and dequantization. These are out of core ATen operators but are vital to most of the production use cases. -Notice that we also support _partial kernels, _for example the kernel only supports a subset of tensor dtypes and/or dim orders. +### Custom kernel libraries: -**Kernel contract**: kernels need to comply with the following requirements: +**Custom kernels implementing core ATen ops**. Even though we don't have an internal example for custom kernels for core ATen ops, the optimized kernel library can be viewed as a good example. We have optimized [`add.out`](https://github.com/pytorch/executorch/blob/main/kernels/optimized/cpu/op_add.cpp) and a portable [`add.out`](https://github.com/pytorch/executorch/blob/main/kernels/portable/cpu/op_add.cpp). When user is combining these two libraries, we provide APIs to choose which kernel to use for `add.out`. In order to author and use custom kernels implementing core ATen ops, using the [YAML based approach](#yaml-entry-for-core-aten-op-out-variant) is recommended, because it provides full fledged support on + 1. combining kernel libraries and define fallback kernels; + 2. using selective build to minimize the kernel size. + +A **[Custom operator](https://github.com/pytorch/executorch/tree/main/extension/llm/custom_ops)** is any operator that an ExecuTorch user defines outside of PyTorch's [`native_functions.yaml`](https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/native_functions.yaml). 
+ +## Operator & Kernel Contract + +All the kernels mentioned above, whether they are in-house or customized, should comply with the following requirements: * Match the calling convention derived from operator schema. The kernel registration API will generate headers for the custom kernels as references. -* Satisfy the dtype constraints defined in edge dialect. For tensors with certain dtypes as arguments, the result of a custom kernel needs to match the expected dtypes. The constraints are available in edge dialect ops. -* Gives correct result. We will provide a testing framework to automatically test the custom kernels. +* Satisfy the dtype constraints defined in edge dialect. For tensors with certain dtypes as arguments, the result of a custom kernel needs to match the expected dtypes. The constraints are available in edge dialect ops. +* Give correct result. We will provide a testing framework to automatically test the custom kernels. + + +## APIs + +These are the APIs available to register kernels/custom kernels/custom ops into ExecuTorch: + +* [YAML Entry API](#yaml-entry-api-high-level-architecture) + - [for core ATen op with custom kernels](#yaml-entry-api-for-core-aten-op-out-variant) + - [for custom ops](#yaml-entry-api-for-custom-ops) + - [CMake Macros](#cmake-macros) +* C++ API + - [for custom ops](#c-api-for-custom-ops) + - [CMake Example](#compile-and-link-the-custom-kernel) + +If it's not clear which API to use, please see [Best Practices](#custom-ops-api-best-practices). + -## High Level Architecture +### YAML Entry API High Level Architecture ![](./_static/img/kernel-library-custom-aten-kernel.png) @@ -27,10 +53,10 @@ ExecuTorch users are asked to provide: 1. the custom kernel library with C++ implementations -2. a yaml file associated with the library that describes what operators are being implemented by this library. For partial kernels, the yaml file also contains information on the dtypes and dim orders supported by the kernel. More details in the API section. +2. a YAML file associated with the library that describes what operators are being implemented by this library. For partial kernels, the yaml file also contains information on the dtypes and dim orders supported by the kernel. More details in the API section. -### Workflow +### YAML Entry API Workflow At build time, the yaml files associated with kernel libraries will be passed to the _kernel resolver_ along with the model op info (see selective build doc) and the outcome is a mapping between a combination of operator names and tensor metadata, to kernel symbols. Then codegen tools will use this mapping to generate C++ bindings that connect the kernels to ExecuTorch runtime. ExecuTorch users need to link this generated library into their application to use these kernels. @@ -38,18 +64,10 @@ At static object initialization time, kernels will be registered into the ExecuT At runtime initialization stage, ExecuTorch will use the operator name and argument metadata as a key to lookup for the kernels. For example, with “aten::add.out” and inputs being float tensors with dim order (0, 1, 2, 3), ExecuTorch will go into the kernel registry and lookup for a kernel that matches the name and the input metadata. - -## APIs - -There are two sets of APIs: yaml files that describe kernel - operator mappings and codegen tools to consume these mappings. 
- - -### Yaml Entry for Core ATen Op Out Variant +### YAML Entry API for Core ATen Op Out Variant Top level attributes: - - * `op` (if the operator appears in `native_functions.yaml`) or `func` for custom operator. The value for this key needs to be the full operator name (including overload name) for `op` key, or a full operator schema (namespace, operator name, operator overload name and schema string), if we are describing a custom operator. For schema syntax please refer to this [instruction](https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/README.md). * `kernels`: defines kernel information. It consists of `arg_meta` and `kernel_name`, which are bound together to describe "for input tensors with these metadata, use this kernel". * `type_alias`(optional): we are giving aliases to possible dtype options. `T0: [Double, Float]` means `T0` can be one of `Double` or `Float`. @@ -86,86 +104,9 @@ ATen operator with a dtype/dim order specialized kernel (works for `Double` dtyp kernel_name: torch::executor::add_out ``` -### Custom Ops C++ API - -For a custom kernel that implements a custom operator, we provides 2 ways to register it into ExecuTorch runtime: -1. Using `EXECUTORCH_LIBRARY` and `WRAP_TO_ATEN` C++ macros, covered by this section. -2. Using `functions.yaml` and codegen'd C++ libraries, covered by [next section](#custom-ops-yaml-entry). - -Please refer to [Custom Ops Best Practices](#custom-ops-api-best-practices) on which API to use. - -The first option requires C++17 and doesn't have selective build support yet, but it's faster than the second option where we have to go through yaml authoring and build system tweaking. - -The first option is particularly suitable for fast prototyping but can also be used in production. - -Similar to `TORCH_LIBRARY`, `EXECUTORCH_LIBRARY` takes the operator name and the C++ function name and register them into ExecuTorch runtime. - -#### Prepare custom kernel implementation - -Define your custom operator schema for both functional variant (used in AOT compilation) and out variant (used in ExecuTorch runtime). The schema needs to follow PyTorch ATen convention (see native_functions.yaml). For example: - -```yaml -custom_linear(Tensor weight, Tensor input, Tensor(?) bias) -> Tensor -custom_linear.out(Tensor weight, Tensor input, Tensor(?) bias, *, Tensor(a!) out) -> Tensor(a!) -``` - -Then write your custom kernel according to the schema using ExecuTorch types, along with APIs to register to ExecuTorch runtime: - - -```c++ -// custom_linear.h/custom_linear.cpp -#include -Tensor& custom_linear_out(const Tensor& weight, const Tensor& input, optional bias, Tensor& out) { - // calculation - return out; -} -``` -#### Use a C++ macro to register it into PyTorch & ExecuTorch - -Append the following line in the example above: -```c++ -// custom_linear.h/custom_linear.cpp -// opset namespace myop -EXECUTORCH_LIBRARY(myop, "custom_linear.out", custom_linear_out); -``` - -Now we need to write some wrapper for this op to show up in PyTorch, but don’t worry we don’t need to rewrite the kernel. 
Create a separate .cpp for this purpose: - -```c++ -// custom_linear_pytorch.cpp -#include "custom_linear.h" -#include - -at::Tensor custom_linear(const at::Tensor& weight, const at::Tensor& input, std::optional bias) { - // initialize out - at::Tensor out = at::empty({weight.size(1), input.size(1)}); - // wrap kernel in custom_linear.cpp into ATen kernel - WRAP_TO_ATEN(custom_linear_out, 3)(weight, input, bias, out); - return out; -} -// standard API to register ops into PyTorch -TORCH_LIBRARY(myop, m) { - m.def("custom_linear(Tensor weight, Tensor input, Tensor(?) bias) -> Tensor", custom_linear); - m.def("custom_linear.out(Tensor weight, Tensor input, Tensor(?) bias, *, Tensor(a!) out) -> Tensor(a!)", WRAP_TO_ATEN(custom_linear_out, 3)); -} -``` - -#### Compile and link the custom kernel - -Link it into ExecuTorch runtime: In our `CMakeLists.txt`` that builds the binary/application, we just need to add custom_linear.h/cpp into the binary target. We can build a dynamically loaded library (.so or .dylib) and link it as well. - -Link it into PyTorch runtime: We need to package custom_linear.h, custom_linear.cpp and custom_linear_pytorch.cpp into a dynamically loaded library (.so or .dylib) and load it into our python environment. One way of doing this is: - -```python -import torch -torch.ops.load_library("libcustom_linear.so/dylib") - -# Now we have access to the custom op, backed by kernel implemented in custom_linear.cpp. -op = torch.ops.myop.custom_linear.default -``` -### Custom Ops Yaml Entry +### YAML Entry API for Custom Ops As mentioned above, this option provides more support in terms of selective build and features such as merging operator libraries. @@ -215,14 +156,11 @@ ExecuTorch does not support all of the argument types that core PyTorch supports * List> * Optional> - -### Build Tool Macros +#### CMake Macros We provide build time macros to help users to build their kernel registration library. The macro takes the yaml file describing the kernel library as well as model operator metadata, and packages the generated C++ bindings into a C++ library. The macro is available on CMake. -#### CMake - `generate_bindings_for_kernels(FUNCTIONS_YAML functions_yaml CUSTOM_OPS_YAML custom_ops_yaml)` takes a yaml file for core ATen op out variants and also a yaml file for custom ops, generate C++ bindings for kernel registration. It also depends on the selective build artifact generated by `gen_selected_ops()`, see selective build doc for more information. Then `gen_operators_lib` will package those bindings to be a C++ library. As an example: ```cmake # SELECT_OPS_LIST: aten::add.out,aten::mm.out @@ -263,6 +201,103 @@ And out fallback: The merged yaml will have the entry in functions.yaml. +### C++ API for Custom Ops + +Unlike the YAML entry API, the C++ API only uses C++ macros `EXECUTORCH_LIBRARY` and `WRAP_TO_ATEN` for kernel registration, also without selective build support. It makes this API faster in terms of development speed, since users don't have to do YAML authoring and build system tweaking. + +Please refer to [Custom Ops Best Practices](#custom-ops-api-best-practices) on which API to use. + +Similar to [`TORCH_LIBRARY`](https://pytorch.org/cppdocs/library.html#library_8h_1a0bd5fb09d25dfb58e750d712fc5afb84) in PyTorch, `EXECUTORCH_LIBRARY` takes the operator name and the C++ function name and register them into ExecuTorch runtime. 
+ +#### Prepare custom kernel implementation + +Define your custom operator schema for both functional variant (used in AOT compilation) and out variant (used in ExecuTorch runtime). The schema needs to follow PyTorch ATen convention (see `native_functions.yaml`). For example: + +```yaml +custom_linear(Tensor weight, Tensor input, Tensor(?) bias) -> Tensor +custom_linear.out(Tensor weight, Tensor input, Tensor(?) bias, *, Tensor(a!) out) -> Tensor(a!) +``` + +Then write your custom kernel according to the schema using ExecuTorch types, along with APIs to register to ExecuTorch runtime: + + +```c++ +// custom_linear.h/custom_linear.cpp +#include +Tensor& custom_linear_out(const Tensor& weight, const Tensor& input, optional bias, Tensor& out) { + // calculation + return out; +} +``` +#### Use a C++ macro to register it into ExecuTorch + +Append the following line in the example above: +```c++ +// custom_linear.h/custom_linear.cpp +// opset namespace myop +EXECUTORCH_LIBRARY(myop, "custom_linear.out", custom_linear_out); +``` + +Now we need to write some wrapper for this op to show up in PyTorch, but don’t worry we don’t need to rewrite the kernel. Create a separate .cpp for this purpose: + +```c++ +// custom_linear_pytorch.cpp +#include "custom_linear.h" +#include + +at::Tensor custom_linear(const at::Tensor& weight, const at::Tensor& input, std::optional bias) { + // initialize out + at::Tensor out = at::empty({weight.size(1), input.size(1)}); + // wrap kernel in custom_linear.cpp into ATen kernel + WRAP_TO_ATEN(custom_linear_out, 3)(weight, input, bias, out); + return out; +} +// standard API to register ops into PyTorch +TORCH_LIBRARY(myop, m) { + m.def("custom_linear(Tensor weight, Tensor input, Tensor(?) bias) -> Tensor", custom_linear); + m.def("custom_linear.out(Tensor weight, Tensor input, Tensor(?) bias, *, Tensor(a!) out) -> Tensor(a!)", WRAP_TO_ATEN(custom_linear_out, 3)); +} +``` + +#### Compile and link the custom kernel + +Link it into ExecuTorch runtime: In our `CMakeLists.txt` that builds the binary/application, we need to add custom_linear.h/cpp into the binary target. We can build a dynamically loaded library (.so or .dylib) and link it as well. + +Here's an example to do it: + +```cmake +# For target_link_options_shared_lib +include(${EXECUTORCH_ROOT}/build/Utils.cmake) + +# Add a custom op library +add_library(custom_op_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/custom_op.cpp) + +# Include the header +target_include_directory(custom_op_lib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include) + +# Link ExecuTorch library +target_link_libraries(custom_op_lib PUBLIC executorch) + +# Define a binary target +add_executable(custom_op_runner PUBLIC main.cpp) + +# Link this library with --whole-archive !! IMPORTANT !! this is to avoid the operators being stripped by linker +target_link_options_shared_lib(custom_op_lib) + +# Link custom op lib +target_link_libraries(custom_op_runner PUBLIC custom_op_lib) + +``` + +Link it into the PyTorch runtime: We need to package custom_linear.h, custom_linear.cpp and custom_linear_pytorch.cpp into a dynamically loaded library (.so or .dylib) and load it into our python environment. One way of doing this is: + +```python +import torch +torch.ops.load_library("libcustom_linear.so/dylib") + +# Now we have access to the custom op, backed by kernel implemented in custom_linear.cpp. 
+op = torch.ops.myop.custom_linear.default +``` ### Custom Ops API Best Practices diff --git a/docs/source/llm/build-run-llama3-qualcomm-ai-engine-direct-backend.md b/docs/source/llm/build-run-llama3-qualcomm-ai-engine-direct-backend.md new file mode 100644 index 00000000000..ac95fb21bd8 --- /dev/null +++ b/docs/source/llm/build-run-llama3-qualcomm-ai-engine-direct-backend.md @@ -0,0 +1,128 @@ +# Building and Running Llama 3 8B Instruct with Qualcomm AI Engine Direct Backend + +This tutorial demonstrates how to export Llama 3 8B Instruct for Qualcomm AI Engine Direct Backend and running the model on a Qualcomm device. + +## Prerequisites + +- Set up your ExecuTorch repo and environment if you haven’t done so by following [the Setting up ExecuTorch](../getting-started-setup.md) to set up the repo and dev environment. +- Read [the Building and Running ExecuTorch with Qualcomm AI Engine Direct Backend page](../build-run-qualcomm-ai-engine-direct-backend.md) to understand how to export and run a model with Qualcomm AI Engine Direct Backend on Qualcomm device. +- Follow [the README for executorch llama](https://github.com/pytorch/executorch/tree/main/examples/models/llama2) to know how to run a llama model on mobile via ExecuTorch. +- A Qualcomm device with 16GB RAM + - We are continuing to optimize our memory usage to ensure compatibility with lower memory devices. +- The version of [Qualcomm AI Engine Direct SDK](https://developer.qualcomm.com/software/qualcomm-ai-engine-direct-sdk) is 2.26.0 or above. + +## Instructions + +### Step1: Prepare the checkpoint of the model and optimized matrix from [Spin Quant](https://github.com/facebookresearch/SpinQuant) + +1. For Llama 3 tokenizer and checkpoint, please refer to https://github.com/meta-llama/llama-models/blob/main/README.md for further instructions on how to download `tokenizer.model`, `consolidated.00.pth` and `params.json`. +2. To get the optimized matrix, please refer to [SpinQuant on GitHub](https://github.com/facebookresearch/SpinQuant). You can download the optimized rotation matrices in the Quantized Models section. Please choose **LLaMA-3-8B/8B_W4A16KV16_lr_1.5_seed_0**. + +### Step2: Export to ExecuTorch with Qualcomm AI Engine Direct Backend +Deploying large language models like Llama 3 on-device presents the following challenges: + +1. The model size is too large to fit in device memory for inference. +2. High model loading and inference time. +3. Difficulty in quantization. + +To address these challenges, we have implemented the following solutions: +1. Using `--pt2e_quantize qnn_16a4w` to quantize activations and weights, thereby reducing the on-disk model size and alleviating memory pressure during inference. +2. Using `--num_sharding 8` to shard the model into sub-parts. +3. Performing graph transformations to convert or decompose operations into more accelerator-friendly operations. +4. Using `--optimized_rotation_path ` to apply R1 and R2 of [Spin Quant](https://github.com/facebookresearch/SpinQuant) to improve accuracy. +5. Using `--calibration_data "<|start_header_id|>system<|end_header_id|..."` to ensure that during the quantization of Llama 3 8B instruct, the calibration includes special tokens in the prompt template. For more details on the prompt template, refer to [the model card of meta llama3 instruct](https://llama.meta.com/docs/model-cards-and-prompt-formats/meta-llama-3/). + +To export Llama 3 8B instruct with the Qualcomm AI Engine Direct Backend, ensure the following: + +1. 
The host machine has more than 100GB of memory (RAM + swap space). +2. The entire process takes a few hours. + +```bash +# Please note that calibration_data must include the prompt template for special tokens. +python -m examples.models.llama2.export_llama -t +llama3/Meta-Llama-3-8B-Instruct/tokenizer.model -p -c --use_kv_cache --qnn --pt2e_quantize qnn_16a4w --disable_dynamic_shape --num_sharding 8 --calibration_tasks wikitext --calibration_limit 1 --calibration_seq_length 128 --optimized_rotation_path --calibration_data "<|start_header_id|>system<|end_header_id|>\n\nYou are a funny chatbot.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nCould you tell me about Facebook?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" +``` + +### Step3: Invoke the Runtime on an Android smartphone with Qualcomm SoCs +1. Build executorch with Qualcomm AI Engine Direct Backend for android + ```bash + cmake \ + -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake" \ + -DANDROID_ABI=arm64-v8a \ + -DANDROID_PLATFORM=android-23 \ + -DCMAKE_INSTALL_PREFIX=cmake-android-out \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ + -DEXECUTORCH_BUILD_QNN=ON \ + -DQNN_SDK_ROOT=${QNN_SDK_ROOT} \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ + -Bcmake-android-out . + + cmake --build cmake-android-out -j16 --target install --config Release + ``` +2. Build llama runner for android +```bash + cmake \ + -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK_ROOT}"/build/cmake/android.toolchain.cmake \ + -DANDROID_ABI=arm64-v8a \ + -DANDROID_PLATFORM=android-23 \ + -DCMAKE_INSTALL_PREFIX=cmake-android-out \ + -DCMAKE_BUILD_TYPE=Release -DPYTHON_EXECUTABLE=python \ + -DEXECUTORCH_BUILD_QNN=ON \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ + -Bcmake-android-out/examples/models/llama2 examples/models/llama2 + + cmake --build cmake-android-out/examples/models/llama2 -j16 --config Release +``` +3. Run on Android via adb shell +*Pre-requisite*: Make sure you enable USB debugging via developer options on your phone + +**3.1 Connect your android phone** + +**3.2 We need to push required QNN libraries to the device.** +```bash +# make sure you have write-permission on below path. 
+DEVICE_DIR=/data/local/tmp/llama +adb shell mkdir -p ${DEVICE_DIR} +adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtp.so ${DEVICE_DIR} +adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnSystem.so ${DEVICE_DIR} +adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtpV69Stub.so ${DEVICE_DIR} +adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtpV73Stub.so ${DEVICE_DIR} +adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtpV75Stub.so ${DEVICE_DIR} +adb push ${QNN_SDK_ROOT}/lib/hexagon-v69/unsigned/libQnnHtpV69Skel.so ${DEVICE_DIR} +adb push ${QNN_SDK_ROOT}/lib/hexagon-v73/unsigned/libQnnHtpV73Skel.so ${DEVICE_DIR} +adb push ${QNN_SDK_ROOT}/lib/hexagon-v75/unsigned/libQnnHtpV75Skel.so ${DEVICE_DIR} +``` + +**3.3 Upload model, tokenizer and llama runner binary to phone** +```bash +adb push ${DEVICE_DIR} +adb push ${DEVICE_DIR} +adb push cmake-android-out/lib/libqnn_executorch_backend.so ${DEVICE_DIR} +adb push cmake-out-android/examples/models/llama2/llama_main ${DEVICE_DIR} +``` + +**3.4 Run model** +```bash +adb shell "cd ${DEVICE_DIR} && ./llama_main --model_path --tokenizer_path --prompt \"<|start_header_id|>system<|end_header_id|>\n\nYou are a funny chatbot.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nCould you tell me about Facebook?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n\" --seq_len 128" +``` +You should see the message: +``` +<|start_header_id|>system<|end_header_id|>\n\nYou are a funny chatbot.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nCould you tell me about Facebook?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHello! I'd be delighted to chat with you about Facebook. Facebook is a social media platform that was created in 2004 by Mark Zuckerberg and his colleagues while he was a student at Harvard University. It was initially called "Facemaker" but later changed to Facebook, which is a combination of the words "face" and "book". The platform was initially intended for people to share their thoughts and share information with their friends, but it quickly grew to become one of the +``` + +## What is coming? +- Improve the performance for Llama 3 Instruct +- Reduce the memory pressure during inference to support 12GB Qualcomm devices +- Support more LLMs + +## FAQ + +If you encounter any issues while reproducing the tutorial, please file a github +issue on ExecuTorch repo and tag use `#qcom_aisw` tag diff --git a/docs/source/llm/getting-started.md b/docs/source/llm/getting-started.md index 5fffb7e8caf..771bf489a94 100644 --- a/docs/source/llm/getting-started.md +++ b/docs/source/llm/getting-started.md @@ -198,25 +198,21 @@ Create a file called main.cpp with the following contents: // main.cpp #include -#include -#include -#include -#include "basic_tokenizer.h" #include "basic_sampler.h" -#include "managed_tensor.h" +#include "basic_tokenizer.h" #include -#include +#include +#include #include -#include -#include - -using namespace torch::executor; +#include -using SizesType = exec_aten::SizesType; -using DimOrderType = exec_aten::DimOrderType; -using StridesType = exec_aten::StridesType; +using exec_aten::ScalarType; +using exec_aten::Tensor; +using executorch::extension::Module; +using executorch::runtime::EValue; +using executorch::runtime::Result; ``` The model inputs and outputs take the form of tensors. A tensor can be thought of as an multi-dimensional array. @@ -248,14 +244,13 @@ std::string generate( for (auto i = 0u; i < max_output_length; i++) { // Convert the input_tokens from a vector of int64_t to EValue. 
// EValue is a unified data type in the ExecuTorch runtime. - ManagedTensor tensor_tokens( + auto inputs = from_blob( input_tokens.data(), {1, static_cast(input_tokens.size())}, ScalarType::Long); - std::vector inputs = {tensor_tokens.get_tensor()}; // Run the model. It will return a tensor of logits (log-probabilities). - Result> logits_evalue = llm_model.forward(inputs); + auto logits_evalue = llm_model.forward(inputs); // Convert the output logits from EValue to std::vector, which is what // the sampler expects. @@ -343,7 +338,6 @@ Finally, download the following files into the same directory as main.h: ``` curl -O https://raw.githubusercontent.com/pytorch/executorch/main/examples/llm_manual/basic_sampler.h curl -O https://raw.githubusercontent.com/pytorch/executorch/main/examples/llm_manual/basic_tokenizer.h -curl -O https://raw.githubusercontent.com/pytorch/executorch/main/examples/llm_manual/managed_tensor.h ``` To learn more, see the [Runtime APIs Tutorial](../extension-module.md). @@ -368,6 +362,7 @@ set(CMAKE_CXX_STANDARD_REQUIRED True) # Set options for executorch build. option(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER "" ON) option(EXECUTORCH_BUILD_EXTENSION_MODULE "" ON) +option(EXECUTORCH_BUILD_EXTENSION_TENSOR "" ON) option(EXECUTORCH_BUILD_KERNELS_OPTIMIZED "" ON) # Include the executorch subdirectory. @@ -381,6 +376,7 @@ target_link_libraries( PRIVATE executorch extension_module_static # Provides the Module class + extension_tensor # Provides the TensorPtr class optimized_native_cpu_ops_lib) # Provides baseline cross-platform kernels ``` @@ -390,7 +386,6 @@ At this point, the working directory should contain the following files: - main.cpp - basic_tokenizer.h - basic_sampler.h -- managed_tensor.h - export_nanogpt.py - model.py - vocab.json @@ -522,6 +517,7 @@ set(CMAKE_CXX_STANDARD_REQUIRED True) # Set options for executorch build. option(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER "" ON) option(EXECUTORCH_BUILD_EXTENSION_MODULE "" ON) +option(EXECUTORCH_BUILD_EXTENSION_TENSOR "" ON) option(EXECUTORCH_BUILD_KERNELS_OPTIMIZED "" ON) option(EXECUTORCH_BUILD_XNNPACK "" ON) # Build with Xnnpack backend @@ -538,6 +534,7 @@ target_link_libraries( PRIVATE executorch extension_module_static # Provides the Module class + extension_tensor # Provides the TensorPtr class optimized_native_cpu_ops_lib # Provides baseline cross-platform kernels xnnpack_backend) # Provides the XNNPACK CPU acceleration backend ``` @@ -552,7 +549,6 @@ At this point, the working directory should contain the following files: - main.cpp - basic_tokenizer.h - basic_sampler.h -- managed_tensor.h - export_nanogpt.py - model.py - vocab.json @@ -591,8 +587,8 @@ I'm not sure if you've heard of the "Curse of the Dragon" or not, but it's a ver The delegated model should be noticeably faster compared to the non-delegated model. For more information regarding backend delegateion, see the ExecuTorch guides -for the [XNNPACK Backend](../tutorial-xnnpack-delegate-lowering.md) and [Core ML -Backend](../build-run-coreml.md). +for the [XNNPACK Backend](../tutorial-xnnpack-delegate-lowering.md), [Core ML +Backend](../build-run-coreml.md) and [Qualcomm AI Engine Direct Backend](build-run-llama3-qualcomm-ai-engine-direct-backend.md). ## Quantization @@ -750,7 +746,7 @@ In the fragment of the output for nanoGPT below, observe that embedding and add ### Performance Analysis -Through the ExecuTorch SDK, users are able to profile model execution, giving timing information for each operator in the model. 
+Through the ExecuTorch Developer Tools, users are able to profile model execution, giving timing information for each operator in the model. #### Prerequisites @@ -763,7 +759,7 @@ In your export script, after calling `to_edge()` and `to_executorch()`, call `ge ``` import copy -from executorch.sdk import generate_etrecord +from executorch.devtools import generate_etrecord # Make the deep copy immediately after to to_edge() edge_manager_copy = copy.deepcopy(edge_manager) @@ -784,7 +780,7 @@ Include the ETDump header in your code. ```cpp // main.cpp -#include +#include ``` Create an Instance of the ETDumpGen class and pass it to the Module constructor. @@ -809,10 +805,10 @@ if (result.buf != nullptr && result.size > 0) { } ``` -Additionally, update CMakeLists.txt to build with SDK and enable events to be traced and logged into ETDump: +Additionally, update CMakeLists.txt to build with Developer Tools and enable events to be traced and logged into ETDump: ``` -option(EXECUTORCH_BUILD_SDK "" ON) +option(EXECUTORCH_BUILD_DEVTOOLS "" ON) # ... @@ -835,7 +831,7 @@ Run the runner, you will see “etdump.etdp” generated. Once you’ve collected debug artifacts ETDump (and optionally an ETRecord), you can use the Inspector API to view performance information. ```python -from executorch.sdk import Inspector +from executorch.devtools import Inspector inspector = Inspector(etdump_path="etdump.etdp") # If you also generated an ETRecord, then pass that in as well: `inspector = Inspector(etdump_path="etdump.etdp", etrecord="etrecord.bin")` diff --git a/docs/source/native-delegates-executorch-xnnpack-delegate.md b/docs/source/native-delegates-executorch-xnnpack-delegate.md index 1d12daef9d8..41de29687e3 100644 --- a/docs/source/native-delegates-executorch-xnnpack-delegate.md +++ b/docs/source/native-delegates-executorch-xnnpack-delegate.md @@ -74,7 +74,7 @@ Since weight packing creates an extra copy of the weights inside XNNPACK, We fre When executing the XNNPACK subgraphs, we prepare the tensor inputs and outputs and feed them to the XNNPACK runtime graph. After executing the runtime graph, the output pointers are filled with the computed tensors. #### **Profiling** -We have enabled basic profiling for XNNPACK delegate that can be enabled with the following compiler flag `-DENABLE_XNNPACK_PROFILING`. With ExecuTorch's SDK integration, you can also now use the SDK tools to profile the model. You can follow the steps in [Using the ExecuTorch SDK to Profile a Model](./tutorials/sdk-integration-tutorial) on how to profile ExecuTorch models and use SDK's Inspector API to view XNNPACK's internal profiling information. +We have enabled basic profiling for XNNPACK delegate that can be enabled with the following compiler flag `-DENABLE_XNNPACK_PROFILING`. With ExecuTorch's Developer Tools integration, you can also now use the Developer Tools to profile the model. You can follow the steps in [Using the ExecuTorch Developer Tools to Profile a Model](./tutorials/devtools-integration-tutorial) on how to profile ExecuTorch models and use Developer Tools' Inspector API to view XNNPACK's internal profiling information. 
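As a rough sketch of what that inspection can look like, the snippet below opens an ETDump with the Inspector and prints the slowest events in the execute block, which is where delegate and operator timings appear. The attribute names used here (`event_blocks`, `events`, `perf_data.avg`) and the `"Execute"` block name follow the Inspector documentation linked above and should be verified against that page; the reported values are in the Inspector's configured time scale.

```python
from executorch.devtools import Inspector

# An ETDump alone is enough for raw timings; pass etrecord=... as well to link
# events back to source code and module hierarchy.
inspector = Inspector(etdump_path="etdump.etdp")

for event_block in inspector.event_blocks:
    # Skip the model-load block and only look at execution events.
    if event_block.name != "Execute":
        continue
    timed_events = [e for e in event_block.events if e.perf_data is not None]
    # Print the ten slowest events by average runtime.
    for event in sorted(timed_events, key=lambda e: e.perf_data.avg, reverse=True)[:10]:
        print(f"{event.name}: avg {event.perf_data.avg:.3f}")
```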
[comment]: <> (TODO: Refactor quantizer to a more official quantization doc) @@ -110,9 +110,9 @@ quantizer.set_global(quantization_config) ### Quantizing your model with the XNNPACKQuantizer After configuring our quantizer, we are now ready to quantize our model ```python -from torch._export import capture_pre_autograd_graph +from torch.export import export_for_training -exported_model = capture_pre_autograd_graph(model_to_quantize, example_inputs) +exported_model = export_for_training(model_to_quantize, example_inputs).module() prepared_model = prepare_pt2e(exported_model, quantizer) print(prepared_model.graph) ``` diff --git a/docs/source/runtime-overview.md b/docs/source/runtime-overview.md index 7bc8b4dd8b4..6766e678e0e 100644 --- a/docs/source/runtime-overview.md +++ b/docs/source/runtime-overview.md @@ -96,7 +96,7 @@ can build it for a wide variety of target systems. #### C++ Language Considerations -* The code is C++11-compatible to work with older toolchains. +* The code is C++17-compatible to work with older toolchains. * The runtime does not use exceptions or RTTI, although it is not antagonistic to them. * The code is compatible with GCC and Clang, and has also been built with diff --git a/docs/source/sdk-bundled-io.md b/docs/source/sdk-bundled-io.md index 33deae3904b..776c37a5da3 100644 --- a/docs/source/sdk-bundled-io.md +++ b/docs/source/sdk-bundled-io.md @@ -28,7 +28,7 @@ In `BundledProgram`, we create two new classes, `MethodTestCase` and `MethodTest :::{dropdown} `MethodTestCase` ```{eval-rst} -.. autofunction:: executorch.sdk.bundled_program.config.MethodTestCase.__init__ +.. autofunction:: executorch.devtools.bundled_program.config.MethodTestCase.__init__ :noindex: ``` ::: @@ -38,7 +38,7 @@ In `BundledProgram`, we create two new classes, `MethodTestCase` and `MethodTest :::{dropdown} `MethodTestSuite` ```{eval-rst} -.. autofunction:: executorch.sdk.bundled_program.config.MethodTestSuite +.. autofunction:: executorch.devtools.bundled_program.config.MethodTestSuite :noindex: ``` ::: @@ -48,13 +48,13 @@ Since each model may have multiple inference methods, we need to generate `List[ ### Step 3: Generate `BundledProgram` -We provide `BundledProgram` class under `executorch/sdk/bundled_program/core.py` to bundled the `ExecutorchProgram`-like variable, including +We provide `BundledProgram` class under `executorch/devtools/bundled_program/core.py` to bundled the `ExecutorchProgram`-like variable, including `ExecutorchProgram`, `MultiMethodExecutorchProgram` or `ExecutorchProgramManager`, with the `List[MethodTestSuite]`: :::{dropdown} `BundledProgram` ```{eval-rst} -.. autofunction:: executorch.sdk.bundled_program.core.BundledProgram.__init__ +.. autofunction:: executorch.devtools.bundled_program.core.BundledProgram.__init__ :noindex: ``` ::: @@ -65,18 +65,18 @@ Construtor of `BundledProgram `will do sannity check internally to see if the gi ### Step 4: Serialize `BundledProgram` to Flatbuffer. -To serialize `BundledProgram` to make runtime APIs use it, we provide two APIs, both under `executorch/sdk/bundled_program/serialize/__init__.py`. +To serialize `BundledProgram` to make runtime APIs use it, we provide two APIs, both under `executorch/devtools/bundled_program/serialize/__init__.py`. :::{dropdown} Serialize and Deserialize ```{eval-rst} -.. currentmodule:: executorch.sdk.bundled_program.serialize +.. currentmodule:: executorch.devtools.bundled_program.serialize .. autofunction:: serialize_from_bundled_program_to_flatbuffer :noindex: ``` ```{eval-rst} -.. 
currentmodule:: executorch.sdk.bundled_program.serialize +.. currentmodule:: executorch.devtools.bundled_program.serialize .. autofunction:: deserialize_from_flatbuffer_to_bundled_program :noindex: ``` @@ -90,14 +90,13 @@ Here is a flow highlighting how to generate a `BundledProgram` given a PyTorch m import torch from executorch.exir import to_edge -from executorch.sdk import BundledProgram +from executorch.devtools import BundledProgram -from executorch.sdk.bundled_program.config import MethodTestCase, MethodTestSuite -from executorch.sdk.bundled_program.serialize import ( +from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite +from executorch.devtools.bundled_program.serialize import ( serialize_from_bundled_program_to_flatbuffer, ) -from torch._export import capture_pre_autograd_graph -from torch.export import export +from torch.export import export, export_for_training # Step 1: ExecuTorch Program Export @@ -131,7 +130,7 @@ capture_input = ( # Export method's FX Graph. method_graph = export( - capture_pre_autograd_graph(model, capture_input), + export_for_training(model, capture_input).module(), capture_input, ) @@ -187,7 +186,7 @@ with open(save_path, "wb") as f: We can also regenerate `BundledProgram` from flatbuffer file if needed: ```python -from executorch.sdk.bundled_program.serialize import deserialize_from_flatbuffer_to_bundled_program +from executorch.devtools.bundled_program.serialize import deserialize_from_flatbuffer_to_bundled_program save_path = "bundled_program.bpte" with open(save_path, "rb") as f: serialized_bundled_program = f.read() @@ -211,21 +210,19 @@ We need the pointer to ExecuTorch program to do the execution. To unify the proc Here's an example of how to use the `GetProgramData` API: ```c++ -std::shared_ptr buff_ptr; -size_t buff_len; - -// FILE_PATH here can be either BundledProgram or Program flatbuffer file. -Error status = torch::executor::util::read_file_content( - FILE_PATH, &buff_ptr, &buff_len); -ET_CHECK_MSG( - status == Error::Ok, - "read_file_content() failed with status 0x%" PRIx32, - status); - +// Assume that the user has read the contents of the file into file_data using +// whatever method works best for their application. The file could contain +// either BundledProgram data or Program data. +void* file_data = ...; +size_t file_data_len = ...; + +// If file_data contains a BundledProgram, GetProgramData() will return a +// pointer to the Program data embedded inside it. Otherwise it will return +// file_data, which already pointed to Program data. const void* program_ptr; size_t program_len; status = torch::executor::bundled_program::GetProgramData( - buff_ptr.get(), buff_len, &program_ptr, &program_len); + file_data, file_data_len, &program_ptr, &program_len); ET_CHECK_MSG( status == Error::Ok, "GetProgramData() failed with status 0x%" PRIx32, @@ -255,7 +252,7 @@ We call `torch::executor::bundled_program::VerifyResultWithBundledExpectedOutput ### Runtime Example -Here we provide an example about how to run the bundled program step by step. Most of the code is borrowed from [executor_runner](https://github.com/pytorch/executorch/blob/main/examples/sdk/sdk_example_runner/sdk_example_runner.cpp), and please review that file if you need more info and context: +Here we provide an example about how to run the bundled program step by step. 
Most of the code is borrowed from [executor_runner](https://github.com/pytorch/executorch/blob/main/examples/devtools/example_runner/example_runner.cpp), and please review that file if you need more info and context: ```c++ // method_name is the name for the method we want to test @@ -313,9 +310,9 @@ Here's the example of the dtype of test input not meet model's requirement: import torch from executorch.exir import to_edge -from executorch.sdk import BundledProgram +from executorch.devtools import BundledProgram -from executorch.sdk.bundled_program.config import MethodTestCase, MethodTestSuite +from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite from torch.export import export @@ -340,7 +337,7 @@ inputs = (torch.ones(2, 2, dtype=torch.float), ) # Find each method of model needs to be traced my its name, export its FX Graph. method_graph = export( - capture_pre_autograd_graph(model, inputs), + export_for_training(model, inputs).module(), inputs, ) @@ -400,7 +397,7 @@ Cell In[1], line 72 68 ] 70 # Step 3: Generate BundledProgram ---> 72 bundled_program = create_bundled_program(program, method_test_suites) -File /executorch/sdk/bundled_program/core.py:276, in create_bundled_program(program, method_test_suites) +File /executorch/devtools/bundled_program/core.py:276, in create_bundled_program(program, method_test_suites) 264 """Create bp_schema.BundledProgram by bundling the given program and method_test_suites together. 265 266 Args: @@ -411,7 +408,7 @@ File /executorch/sdk/bundled_program/core.py:276, in create_bundled_program(prog --> 276 assert_valid_bundle(program, method_test_suites) 278 bundled_method_test_suites: List[bp_schema.BundledMethodTestSuite] = [] 280 # Emit data and metadata of bundled tensor -File /executorch/sdk/bundled_program/core.py:219, in assert_valid_bundle(program, method_test_suites) +File /executorch/devtools/bundled_program/core.py:219, in assert_valid_bundle(program, method_test_suites) 215 # type of tensor input should match execution plan 216 if type(cur_plan_test_inputs[j]) == torch.Tensor: 217 # pyre-fixme[16]: Undefined attribute [16]: Item `bool` of `typing.Union[bool, float, int, torch._tensor.Tensor]` @@ -449,9 +446,9 @@ Another common error would be the method name in any `MethodTestSuite` does not import torch from executorch.exir import to_edge -from executorch.sdk import BundledProgram +from executorch.devtools import BundledProgram -from executorch.sdk.bundled_program.config import MethodTestCase, MethodTestSuite +from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite from torch.export import export @@ -476,7 +473,7 @@ inputs = (torch.ones(2, 2, dtype=torch.float),) # Find each method of model needs to be traced my its name, export its FX Graph. method_graph = export( - capture_pre_autograd_graph(model, inputs), + export_for_training(model, inputs).module(), inputs, ) @@ -532,7 +529,7 @@ Cell In[3], line 73 70 method_test_suites[0].method_name = "MISSING_METHOD_NAME" 72 # Generate BundledProgram ---> 73 bundled_program = create_bundled_program(program, method_test_suites) -File /executorch/sdk/bundled_program/core.py:276, in create_bundled_program(program, method_test_suites) +File /executorch/devtools/bundled_program/core.py:276, in create_bundled_program(program, method_test_suites) 264 """Create bp_schema.BundledProgram by bundling the given program and method_test_suites together. 
265 266 Args: @@ -543,7 +540,7 @@ File /executorch/sdk/bundled_program/core.py:276, in create_bundled_program(prog --> 276 assert_valid_bundle(program, method_test_suites) 278 bundled_method_test_suites: List[bp_schema.BundledMethodTestSuite] = [] 280 # Emit data and metadata of bundled tensor -File /executorch/sdk/bundled_program/core.py:141, in assert_valid_bundle(program, method_test_suites) +File /executorch/devtools/bundled_program/core.py:141, in assert_valid_bundle(program, method_test_suites) 138 method_name_of_program = {e.name for e in program.execution_plan} 139 method_name_of_test_suites = {t.method_name for t in method_test_suites} --> 141 assert method_name_of_test_suites.issubset( diff --git a/docs/source/sdk-debugging.md b/docs/source/sdk-debugging.md index 45e50b44e87..4707b4a2f99 100644 --- a/docs/source/sdk-debugging.md +++ b/docs/source/sdk-debugging.md @@ -1,6 +1,6 @@ # Debugging Models in ExecuTorch -With the ExecuTorch SDK, users can debug their models for numerical inaccurcies and extract model outputs from their device to do quality analysis (such as Signal-to-Noise, Mean square error etc.). +With the ExecuTorch Developer Tools, users can debug their models for numerical inaccurcies and extract model outputs from their device to do quality analysis (such as Signal-to-Noise, Mean square error etc.). Currently, ExecuTorch supports the following debugging flows: - Extraction of model level outputs via ETDump. @@ -11,7 +11,7 @@ Currently, ExecuTorch supports the following debugging flows: ## Steps to debug a model in ExecuTorch ### Runtime -For a real example reflecting the steps below, please refer to [sdk_example_runner.cpp](https://github.com/pytorch/executorch/blob/main/examples/sdk/sdk_example_runner/sdk_example_runner.cpp). +For a real example reflecting the steps below, please refer to [example_runner.cpp](https://github.com/pytorch/executorch/blob/main/examples/devtools/example_runner/example_runner.cpp). 1. [Optional] Generate an [ETRecord](./sdk-etrecord.rst) while exporting your model. When provided, this enables users to link profiling information back to the eager model source code (with stack traces and module hierarchy). 2. Integrate [ETDump generation](./sdk-etdump.md) into the runtime and set the debugging level by configuring the `ETDumpGen` object. Then, provide an additional buffer to which intermediate outputs and program outputs will be written. Currently we support two levels of debugging: @@ -38,7 +38,7 @@ For a real example reflecting the steps below, please refer to [sdk_example_runn Once a model has been run, using the generated ETDump and debug buffers, users can leverage the [Inspector API's](./sdk-inspector.rst) to inspect these debug outputs. ```python -from executorch.sdk import Inspector +from executorch.devtools import Inspector # Create an Inspector instance with etdump and the debug buffer. inspector = Inspector(etdump_path=etdump_path, @@ -67,7 +67,7 @@ We've also provided a simple set of utilities that let users perform quality ana ```python -from executorch.sdk.inspector._inspector_utils import compare_results +from executorch.devtools.inspector import compare_results # Run a simple quality analysis between the model outputs sourced from the # runtime and a set of reference outputs. 
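To make the quality-analysis step above concrete, the sketch below shows one way the two imports from this section might be combined. It is a minimal, illustrative sketch only: the artifact file names are placeholders, the stand-in tensors take the place of outputs that would normally be pulled from the device run via the debug buffer, and the positional argument order passed to `compare_results` (reference outputs first, runtime outputs second) is an assumption rather than a documented contract.

```python
import torch

from executorch.devtools import Inspector
from executorch.devtools.inspector import compare_results

# Placeholder artifact paths produced by an earlier export and on-device run.
# The Inspector gives access to the debug outputs referenced in the prose above.
inspector = Inspector(etdump_path="etdump.etdp", etrecord="etrecord.bin")

# Reference outputs, e.g. recorded from the eager PyTorch model ahead of time.
reference_outputs = [torch.ones(1, 10)]

# Outputs extracted from the runtime; a stand-in tensor keeps the sketch
# self-contained instead of reading them back from the debug buffer.
runtime_outputs = [torch.ones(1, 10) * 0.99]

# Assumed argument order: reference outputs first, runtime outputs second.
compare_results(reference_outputs, runtime_outputs)
```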
diff --git a/docs/source/sdk-delegate-integration.md b/docs/source/sdk-delegate-integration.md index 80033711552..a2f67157c89 100644 --- a/docs/source/sdk-delegate-integration.md +++ b/docs/source/sdk-delegate-integration.md @@ -1,4 +1,4 @@ -# SDK Delegate Integration +# Developer Tools Delegate Integration [Delegate backends](compiler-delegate-and-partitioner.md) are a prominent component of on-device models due to their flexibility in defining behavior. A side effect of this flexibility is that it operates as an opaque transformation. This obfuscates rich associations and mutations that are valuable in post-processing. - For example, if two different operator fusions were to occur within a delegate, post processing wouldn’t be able to separate the two transformations. diff --git a/docs/source/sdk-etdump.md b/docs/source/sdk-etdump.md index 4eacb18b14c..c58efb40de7 100644 --- a/docs/source/sdk-etdump.md +++ b/docs/source/sdk-etdump.md @@ -1,6 +1,6 @@ # Prerequisite | ETDump - ExecuTorch Dump -ETDump (ExecuTorch Dump) is one of the core components of the ExecuTorch SDK experience. It is the mechanism through which all forms of profiling and debugging data is extracted from the runtime. Users can't parse ETDump directly; instead, they should pass it into the Inspector API, which deserializes the data, offering interfaces for flexible analysis and debugging. +ETDump (ExecuTorch Dump) is one of the core components of the ExecuTorch Developer Tools. It is the mechanism through which all forms of profiling and debugging data is extracted from the runtime. Users can't parse ETDump directly; instead, they should pass it into the Inspector API, which deserializes the data, offering interfaces for flexible analysis and debugging. ## Generating an ETDump @@ -9,7 +9,7 @@ Generating an ETDump is a relatively straightforward process. Users can follow t 1. ***Include*** the ETDump header in your code. ```C++ -#include +#include ``` 2. ***Create*** an Instance of the ETDumpGen class and pass it into the `load_method` call that is invoked in the runtime. diff --git a/docs/source/sdk-etrecord.rst b/docs/source/sdk-etrecord.rst index 43ed5095c64..63546f43ca6 100644 --- a/docs/source/sdk-etrecord.rst +++ b/docs/source/sdk-etrecord.rst @@ -9,7 +9,7 @@ users ahead of time (when they export their model to run on ExecuTorch). To draw a rough equivalent to conventional software development, ``ETRecord`` can be considered as the binary built with debug symbols that is used for debugging in GNU Debugger (gdb). It is expected that -the user will supply this to the ExecuTorch SDK tooling in order for +the user will supply this to the ExecuTorch Developer Tools in order for them to debug and visualize their model. ``ETRecord`` contains numerous components such as: @@ -31,7 +31,7 @@ they are interested in working with via our tooling. .. warning:: Users should do a deepcopy of the output of ``to_edge()`` and pass in the deepcopy to the ``generate_etrecord`` API. This is needed because the subsequent call, ``to_executorch()``, does an in-place mutation and will lose debug data in the process. -.. currentmodule:: executorch.sdk.etrecord._etrecord +.. currentmodule:: executorch.devtools.etrecord._etrecord .. 
autofunction:: generate_etrecord Using an ``ETRecord`` diff --git a/docs/source/sdk-inspector.rst b/docs/source/sdk-inspector.rst index e15c1f2a395..4f55271b3fe 100644 --- a/docs/source/sdk-inspector.rst +++ b/docs/source/sdk-inspector.rst @@ -17,7 +17,7 @@ APIs: * By accessing the `public attributes <#inspector-attributes>`__ of the ``Inspector``, ``EventBlock``, and ``Event`` classes. * By using a `CLI <#cli>`__ tool for basic functionalities. -Please refer to the `e2e use case doc `__ get an understanding of how to use these in a real world example. +Please refer to the `e2e use case doc `__ get an understanding of how to use these in a real world example. Inspector Methods @@ -26,26 +26,26 @@ Inspector Methods Constructor ~~~~~~~~~~~ -.. autofunction:: executorch.sdk.Inspector.__init__ +.. autofunction:: executorch.devtools.Inspector.__init__ **Example Usage:** .. code:: python - from executorch.sdk import Inspector + from executorch.devtools import Inspector inspector = Inspector(etdump_path="/path/to/etdump.etdp", etrecord="/path/to/etrecord.bin") to_dataframe ~~~~~~~~~~~~~~~~ -.. autofunction:: executorch.sdk.Inspector.to_dataframe +.. autofunction:: executorch.devtools.Inspector.to_dataframe print_data_tabular ~~~~~~~~~~~~~~~~~~ -.. autofunction:: executorch.sdk.Inspector.print_data_tabular +.. autofunction:: executorch.devtools.Inspector.print_data_tabular .. _example-usage-1: @@ -62,7 +62,7 @@ Note that the unit of delegate profiling events is "cycles". We're working on pr find_total_for_module ~~~~~~~~~~~~~~~~~~~~~ -.. autofunction:: executorch.sdk.Inspector.find_total_for_module +.. autofunction:: executorch.devtools.Inspector.find_total_for_module .. _example-usage-2: @@ -80,7 +80,7 @@ find_total_for_module get_exported_program ~~~~~~~~~~~~~~~~~~~~ -.. autofunction:: executorch.sdk.Inspector.get_exported_program +.. autofunction:: executorch.devtools.Inspector.get_exported_program .. _example-usage-3: @@ -119,7 +119,7 @@ of an ``Inspector`` instance, for example: inspector.event_blocks -.. autoclass:: executorch.sdk.inspector.EventBlock +.. autoclass:: executorch.devtools.inspector.EventBlock ``Event`` Class ~~~~~~~~~~~~~~~ @@ -127,7 +127,7 @@ of an ``Inspector`` instance, for example: Access ``Event`` instances through the ``events`` attribute of an ``EventBlock`` instance. -.. autoclass:: executorch.sdk.inspector.Event +.. autoclass:: executorch.devtools.inspector.Event **Example Usage:** @@ -152,7 +152,7 @@ table. This command produces the identical table output as calling the .. code:: bash - python3 -m sdk.inspector.inspector_cli --etdump_path --etrecord_path + python3 -m devtools.inspector.inspector_cli --etdump_path --etrecord_path Note that the `etrecord_path` argument is optional. diff --git a/docs/source/sdk-overview.md b/docs/source/sdk-overview.md index 53f7d88613a..1e8f1fae1ba 100644 --- a/docs/source/sdk-overview.md +++ b/docs/source/sdk-overview.md @@ -1,44 +1,3 @@ -# Introduction to the ExecuTorch SDK +# Introduction to the ExecuTorch Developer Tools -ExecuTorch has been designed with [productivity](./intro-overview.md) as one of its core objectives and the ExecuTorch SDK enables this through the comprehensive suite of tools it provides users to help them profile, debug, and visualize models that they have onboarded onto ExecuTorch. - -All the components of the SDK have been designed from the ground up with deep integration in both the export process and the runtime. 
This enables us to provide unique features such as linking back operator execution in the runtime to the line of code in the original eager model that this operator originated from. - -## SDK Features - -The ExecuTorch SDK supports the following features: - -- **BundledProgram** is a utility tool for exporting the model bundled with a sample set of (representative) inputs and expected outputs, so that during runtime users can validate that the actual output is in fact the same as the expected output. -- **Profiling** models with operator level breakdown of performance stats - - Linking back operator performance stats to source code and module hierarchy - - Model loading and execution time -- **Delegate Integration** - Surfacing performance details from delegate backends - - Link back delegate operator execution to the nodes they represent in the edge dialect graph (and subsequently linking back to source code and module hierarchy) -- **Debugging** - Intermediate outputs and output quality analysis -- **Visualization** - Coming soon - -## Fundamental components of the SDK - -In order to fully understand and leverage the power of the SDK in this section, the fundamental components that power the SDK will be detailed. - -### ETRecord -ETRecord (ExecuTorch Record) is an artifact generated during the export process that stores the graphs and other metadata that is critical for the SDK tooling to be able to link back the performance/debug data sourced from the runtime to the source code of the eager model. - -To draw a rough equivalence to conventional software development ETRecord can be considered as the binary built with debug symbols that is used for debugging in GNU Project debugger (gdb). - -More details are available in the [ETRecord documentation](sdk-etrecord.rst) on how to generate and store an ETRecord. - -### ETDump -ETDump (ExecuTorch Dump) is the binary blob that is generated by the runtime after running a model. Similarly as above, to draw a rough equivalence to conventional software development, ETDump can be considered as the coredump of ExecuTorch, but in this case within ETDump we store all the performance and debug data that was generated by the runtime during model execution. - -```{note} -If you only care about looking at the raw performance data without linking back to source code and other extensive features, an ETDump alone will be enough to leverage the basic features of the SDK. For the full experience, it is recommended that the users also generate an ETRecord. -``` - -More details are available in the [ETDump documentation](sdk-etdump.md) on how to generate and store an ETDump from the runtime. - - -### Inspector APIs -The Inspector Python APIs are the main user enrty point into the SDK. They join the data sourced from ETDump and ETRecord to give users access to all the performance and debug data sourced from the runtime along with linkage back to eager model source code and module hierarchy in an easy to use API. - -More details are available in the [Inspector API documentation](sdk-inspector.rst) on how to use the Inspector APIs. +Please update your link to . This URL will be deleted after v0.4.0. diff --git a/docs/source/sdk-profiling.md b/docs/source/sdk-profiling.md index 83276d8d180..e17fb1ae48e 100644 --- a/docs/source/sdk-profiling.md +++ b/docs/source/sdk-profiling.md @@ -4,7 +4,7 @@ Profiling in ExecuTorch gives users access to these runtime metrics: - Model Load Time. - Operator Level Execution Time. - Delegate Execution Time. 
- - If the delegate that the user is calling into has been integrated with the [SDK](./sdk-delegate-integration.md), then users will also be able to access delegated operator execution time. + - If the delegate that the user is calling into has been integrated with the [Developer Tools](./sdk-delegate-integration.md), then users will also be able to access delegated operator execution time. - End-to-end Inference Execution Time. One uniqe aspect of ExecuTorch Profiling is the ability to link every runtime executed operator back to the exact line of python code from which this operator originated. This capability enables users to easily identify hotspots in their model, source them back to the exact line of Python code, and optimize if chosen to. @@ -20,4 +20,4 @@ We provide access to all the profiling data via the Python [Inspector API](./sdk - Through the Inspector API, users can do a wide range of analysis varying from printing out performance details to doing more finer granular calculation on module level. -Please refer to the [SDK tutorial](./tutorials/sdk-integration-tutorial.rst) for a step-by-step walkthrough of the above process on a sample model. +Please refer to the [Developer Tools tutorial](./tutorials/devtools-integration-tutorial.rst) for a step-by-step walkthrough of the above process on a sample model. diff --git a/docs/source/sdk-tutorial.md b/docs/source/sdk-tutorial.md index 90c9ed6d343..457d3b47ebf 100644 --- a/docs/source/sdk-tutorial.md +++ b/docs/source/sdk-tutorial.md @@ -1,3 +1,3 @@ -## SDK usage tutorial +## Developer Tools Usage Tutorial -Please refer to the [SDK tutorial](./tutorials/sdk-integration-tutorial) for a walkthrough on how to profile a model in ExecuTorch using the SDK. +Please update your link to . This URL will be deleted after v0.4.0. diff --git a/docs/source/tutorial-xnnpack-delegate-lowering.md b/docs/source/tutorial-xnnpack-delegate-lowering.md index 4491a6e8c80..666ee23aa35 100644 --- a/docs/source/tutorial-xnnpack-delegate-lowering.md +++ b/docs/source/tutorial-xnnpack-delegate-lowering.md @@ -74,13 +74,13 @@ After lowering to the XNNPACK Program, we can then prepare it for executorch and The XNNPACK delegate can also execute symmetrically quantized models. To understand the quantization flow and learn how to quantize models, refer to [Custom Quantization](quantization-custom-quantization.md) note. For the sake of this tutorial, we will leverage the `quantize()` python helper function conveniently added to the `executorch/executorch/examples` folder. ```python -from torch._export import capture_pre_autograd_graph +from torch.export import export_for_training from executorch.exir import EdgeCompileConfig mobilenet_v2 = models.mobilenetv2.mobilenet_v2(weights=MobileNet_V2_Weights.DEFAULT).eval() sample_inputs = (torch.randn(1, 3, 224, 224), ) -mobilenet_v2 = capture_pre_autograd_graph(mobilenet_v2, sample_inputs) # 2-stage export for quantization path +mobilenet_v2 = export_for_training(mobilenet_v2, sample_inputs).module() # 2-stage export for quantization path from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e from torch.ao.quantization.quantizer.xnnpack_quantizer import ( @@ -107,7 +107,7 @@ def quantize(model, example_inputs): quantized_mobilenetv2 = quantize(mobilenet_v2, sample_inputs) ``` -Quantization requires a two stage export. First we use the `capture_pre_autograd_graph` API to capture the model before giving it to `quantize` utility function. 
After performing the quantization step, we can now leverage the XNNPACK delegate to lower the quantized exported model graph. From here, the procedure is the same as for the non-quantized model lowering to XNNPACK. +Quantization requires a two stage export. First we use the `export_for_training` API to capture the model before giving it to `quantize` utility function. After performing the quantization step, we can now leverage the XNNPACK delegate to lower the quantized exported model graph. From here, the procedure is the same as for the non-quantized model lowering to XNNPACK. ```python # Continued from earlier... @@ -149,9 +149,10 @@ mkdir cmake-out cmake \ -DCMAKE_INSTALL_PREFIX=cmake-out \ -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_XNNPACK=ON \ - -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_ENABLE_LOGGING=ON \ -DPYTHON_EXECUTABLE=python \ -Bcmake-out . diff --git a/docs/source/tutorials_source/devtools-integration-tutorial.py b/docs/source/tutorials_source/devtools-integration-tutorial.py new file mode 100644 index 00000000000..b5e335b43d1 --- /dev/null +++ b/docs/source/tutorials_source/devtools-integration-tutorial.py @@ -0,0 +1,301 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +""" +Using the ExecuTorch Developer Tools to Profile a Model +======================== + +**Author:** `Jack Khuu `__ +""" + +###################################################################### +# The `ExecuTorch Developer Tools <../devtools-overview.html>`__ is a set of tools designed to +# provide users with the ability to profile, debug, and visualize ExecuTorch +# models. +# +# This tutorial will show a full end-to-end flow of how to utilize the Developer Tools to profile a model. +# Specifically, it will: +# +# 1. Generate the artifacts consumed by the Developer Tools (`ETRecord <../sdk-etrecord.html>`__, `ETDump <../sdk-etdump.html>`__). +# 2. Create an Inspector class consuming these artifacts. +# 3. Utilize the Inspector class to analyze the model profiling result. + +###################################################################### +# Prerequisites +# ------------- +# +# To run this tutorial, you’ll first need to +# `Set up your ExecuTorch environment <../getting-started-setup.html>`__. +# + +###################################################################### +# Generate ETRecord (Optional) +# ---------------------------- +# +# The first step is to generate an ``ETRecord``. ``ETRecord`` contains model +# graphs and metadata for linking runtime results (such as profiling) to +# the eager model. This is generated via ``executorch.devtools.generate_etrecord``. +# +# ``executorch.devtools.generate_etrecord`` takes in an output file path (str), the +# edge dialect model (``EdgeProgramManager``), the ExecuTorch dialect model +# (``ExecutorchProgramManager``), and an optional dictionary of additional models. +# +# In this tutorial, an example model (shown below) is used to demonstrate. 
+ +import copy + +import torch +import torch.nn as nn +import torch.nn.functional as F +from executorch.devtools import generate_etrecord + +from executorch.exir import ( + EdgeCompileConfig, + EdgeProgramManager, + ExecutorchProgramManager, + to_edge, +) +from torch.export import export, ExportedProgram + + +# Generate Model +class Net(nn.Module): + def __init__(self): + super(Net, self).__init__() + # 1 input image channel, 6 output channels, 5x5 square convolution + # kernel + self.conv1 = nn.Conv2d(1, 6, 5) + self.conv2 = nn.Conv2d(6, 16, 5) + # an affine operation: y = Wx + b + self.fc1 = nn.Linear(16 * 5 * 5, 120) # 5*5 from image dimension + self.fc2 = nn.Linear(120, 84) + self.fc3 = nn.Linear(84, 10) + + def forward(self, x): + # Max pooling over a (2, 2) window + x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2)) + # If the size is a square, you can specify with a single number + x = F.max_pool2d(F.relu(self.conv2(x)), 2) + x = torch.flatten(x, 1) # flatten all dimensions except the batch dimension + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = self.fc3(x) + return x + + +model = Net() + +aten_model: ExportedProgram = export( + model, + (torch.randn(1, 1, 32, 32),), +) + +edge_program_manager: EdgeProgramManager = to_edge( + aten_model, compile_config=EdgeCompileConfig(_check_ir_validity=True) +) +edge_program_manager_copy = copy.deepcopy(edge_program_manager) +et_program_manager: ExecutorchProgramManager = edge_program_manager.to_executorch() + + +# Generate ETRecord +etrecord_path = "etrecord.bin" +generate_etrecord(etrecord_path, edge_program_manager_copy, et_program_manager) + +# sphinx_gallery_start_ignore +from unittest.mock import patch + +# sphinx_gallery_end_ignore + +###################################################################### +# +# .. warning:: +# Users should do a deepcopy of the output of ``to_edge()`` and pass in the +# deepcopy to the ``generate_etrecord`` API. This is needed because the +# subsequent call, ``to_executorch()``, does an in-place mutation and will +# lose debug data in the process. +# + +###################################################################### +# Generate ETDump +# --------------- +# +# Next step is to generate an ``ETDump``. ``ETDump`` contains runtime results +# from executing a `Bundled Program Model <../sdk-bundled-io.html>`__. +# +# In this tutorial, a `Bundled Program` is created from the example model above. + +import torch +from executorch.devtools import BundledProgram + +from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite +from executorch.devtools.bundled_program.serialize import ( + serialize_from_bundled_program_to_flatbuffer, +) + +from executorch.exir import to_edge +from torch.export import export + +# Step 1: ExecuTorch Program Export +m_name = "forward" +method_graphs = {m_name: export(model, (torch.randn(1, 1, 32, 32),))} + +# Step 2: Construct Method Test Suites +inputs = [[torch.randn(1, 1, 32, 32)] for _ in range(2)] + +method_test_suites = [ + MethodTestSuite( + method_name=m_name, + test_cases=[ + MethodTestCase(inputs=inp, expected_outputs=getattr(model, m_name)(*inp)) + for inp in inputs + ], + ) +] + +# Step 3: Generate BundledProgram +executorch_program = to_edge(method_graphs).to_executorch() +bundled_program = BundledProgram(executorch_program, method_test_suites) + +# Step 4: Serialize BundledProgram to flatbuffer. 
+serialized_bundled_program = serialize_from_bundled_program_to_flatbuffer( + bundled_program +) +save_path = "bundled_program.bp" +with open(save_path, "wb") as f: + f.write(serialized_bundled_program) + +###################################################################### +# Use CMake (follow `these instructions <../runtime-build-and-cross-compilation.html#configure-the-cmake-build>`__ to set up cmake) to execute the Bundled Program to generate the ``ETDump``:: +# +# cd executorch +# ./examples/devtools/build_example_runner.sh +# cmake-out/examples/devtools/example_runner --bundled_program_path="bundled_program.bp" + +###################################################################### +# Creating an Inspector +# --------------------- +# +# Final step is to create the ``Inspector`` by passing in the artifact paths. +# Inspector takes the runtime results from ``ETDump`` and correlates them to +# the operators of the Edge Dialect Graph. +# +# Recall: An ``ETRecord`` is not required. If an ``ETRecord`` is not provided, +# the Inspector will show runtime results without operator correlation. +# +# To visualize all runtime events, call Inspector's ``print_data_tabular``. + +from executorch.devtools import Inspector + +# sphinx_gallery_start_ignore +inspector_patch = patch.object(Inspector, "__init__", return_value=None) +inspector_patch_print = patch.object(Inspector, "print_data_tabular", return_value="") +inspector_patch.start() +inspector_patch_print.start() +# sphinx_gallery_end_ignore +etrecord_path = "etrecord.bin" +etdump_path = "etdump.etdp" +inspector = Inspector(etdump_path=etdump_path, etrecord=etrecord_path) +# sphinx_gallery_start_ignore +inspector.event_blocks = [] +# sphinx_gallery_end_ignore +inspector.print_data_tabular() + +# sphinx_gallery_start_ignore +inspector_patch.stop() +inspector_patch_print.stop() +# sphinx_gallery_end_ignore + +###################################################################### +# Analyzing with an Inspector +# --------------------------- +# +# ``Inspector`` provides 2 ways of accessing ingested information: `EventBlocks <../sdk-inspector#eventblock-class>`__ +# and ``DataFrames``. These mediums give users the ability to perform custom +# analysis about their model performance. +# +# Below are examples usages, with both ``EventBlock`` and ``DataFrame`` approaches. + +# Set Up +import pprint as pp + +import pandas as pd + +pd.set_option("display.max_colwidth", None) +pd.set_option("display.max_columns", None) + +###################################################################### +# If a user wants the raw profiling results, they would do something similar to +# finding the raw runtime data of an ``addmm.out`` event. + +for event_block in inspector.event_blocks: + # Via EventBlocks + for event in event_block.events: + if event.name == "native_call_addmm.out": + print(event.name, event.perf_data.raw) + + # Via Dataframe + df = event_block.to_dataframe() + df = df[df.event_name == "native_call_addmm.out"] + print(df[["event_name", "raw"]]) + print() + +###################################################################### +# If a user wants to trace an operator back to their model code, they would do +# something similar to finding the module hierarchy and stack trace of the +# slowest ``convolution.out`` call. 
+ +for event_block in inspector.event_blocks: + # Via EventBlocks + slowest = None + for event in event_block.events: + if event.name == "native_call_convolution.out": + if slowest is None or event.perf_data.p50 > slowest.perf_data.p50: + slowest = event + if slowest is not None: + print(slowest.name) + print() + pp.pprint(slowest.stack_traces) + print() + pp.pprint(slowest.module_hierarchy) + + # Via Dataframe + df = event_block.to_dataframe() + df = df[df.event_name == "native_call_convolution.out"] + if len(df) > 0: + slowest = df.loc[df["p50"].idxmax()] + print(slowest.event_name) + print() + pp.pprint(slowest.stack_traces) + print() + pp.pprint(slowest.module_hierarchy) + +###################################################################### +# If a user wants the total runtime of a module, they can use +# ``find_total_for_module``. + +print(inspector.find_total_for_module("L__self__")) +print(inspector.find_total_for_module("L__self___conv2")) + +###################################################################### +# Note: ``find_total_for_module`` is a special first class method of +# `Inspector <../sdk-inspector.html>`__ + +###################################################################### +# Conclusion +# ---------- +# +# In this tutorial, we learned about the steps required to consume an ExecuTorch +# model with the ExecuTorch Developer Tools. It also showed how to use the Inspector APIs +# to analyze the model run results. +# +# Links Mentioned +# ^^^^^^^^^^^^^^^ +# +# - `ExecuTorch Developer Tools Overview <../devtools-overview.html>`__ +# - `ETRecord <../sdk-etrecord.html>`__ +# - `ETDump <../sdk-etdump.html>`__ +# - `Inspector <../sdk-inspector.html>`__ diff --git a/docs/source/tutorials_source/export-to-executorch-tutorial.py b/docs/source/tutorials_source/export-to-executorch-tutorial.py index 2071567ddd1..fac3eab08e5 100644 --- a/docs/source/tutorials_source/export-to-executorch-tutorial.py +++ b/docs/source/tutorials_source/export-to-executorch-tutorial.py @@ -179,8 +179,8 @@ def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor: # ----------------------- # # To quantize a model, we first need to capture the graph with -# ``torch._export.capture_pre_autograd_graph``, perform quantization, and then -# call ``torch.export``. ``torch._export.capture_pre_autograd_graph`` returns a +# ``torch.export.export_for_training``, perform quantization, and then +# call ``torch.export``. ``torch.export.export_for_training`` returns a # graph which contains ATen operators which are Autograd safe, meaning they are # safe for eager-mode training, which is needed for quantization. We will call # the graph at this level, the ``Pre-Autograd ATen Dialect`` graph. @@ -193,10 +193,10 @@ def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor: # will annotate the nodes in the graph with information needed to quantize the # model properly for a specific backend. 
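Since the comments above describe the two-stage quantization flow only in prose, here is a condensed sketch of how the pieces fit together end to end, using the XNNPACK quantizer shown elsewhere in these docs. The toy module, tensor shapes, and single calibration pass are illustrative placeholders, not part of the tutorial itself.

```python
import torch
from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e
from torch.ao.quantization.quantizer.xnnpack_quantizer import (
    XNNPACKQuantizer,
    get_symmetric_quantization_config,
)
from torch.export import export, export_for_training


class TinyLinear(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(8, 4)

    def forward(self, x):
        return self.linear(x)


model = TinyLinear().eval()
example_args = (torch.randn(1, 8),)

# Stage 1: pre-autograd capture (Pre-Autograd ATen Dialect graph).
captured = export_for_training(model, example_args).module()

# Annotate the graph with a backend quantizer, calibrate, then convert.
quantizer = XNNPACKQuantizer()
quantizer.set_global(get_symmetric_quantization_config())
prepared = prepare_pt2e(captured, quantizer)
prepared(*example_args)  # one representative calibration pass
quantized = convert_pt2e(prepared)

# Stage 2: export the quantized graph to the ATen Dialect for lowering.
aten_dialect = export(quantized, example_args)
```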
-from torch._export import capture_pre_autograd_graph +from torch.export import export_for_training example_args = (torch.randn(1, 3, 256, 256),) -pre_autograd_aten_dialect = capture_pre_autograd_graph(SimpleConv(), example_args) +pre_autograd_aten_dialect = export_for_training(SimpleConv(), example_args).module() print("Pre-Autograd ATen Dialect Graph") print(pre_autograd_aten_dialect) @@ -523,9 +523,7 @@ def forward(self, a, x, b): executorch_program: ExecutorchProgramManager = edge_program.to_executorch( ExecutorchBackendConfig( passes=[], # User-defined passes - memory_planning_pass=MemoryPlanningPass( - "greedy" - ), # Default memory planning pass + memory_planning_pass=MemoryPlanningPass(), # Default memory planning pass ) ) @@ -562,8 +560,7 @@ def forward(self, a, x, b): # Here is an example for an entire end-to-end workflow: import torch -from torch._export import capture_pre_autograd_graph -from torch.export import export, ExportedProgram +from torch.export import export, export_for_training, ExportedProgram class M(torch.nn.Module): @@ -577,7 +574,7 @@ def forward(self, x): example_args = (torch.randn(3, 4),) -pre_autograd_aten_dialect = capture_pre_autograd_graph(M(), example_args) +pre_autograd_aten_dialect = export_for_training(M(), example_args).module() # Optionally do quantization: # pre_autograd_aten_dialect = convert_pt2e(prepare_pt2e(pre_autograd_aten_dialect, CustomBackendQuantizer)) aten_dialect: ExportedProgram = export(pre_autograd_aten_dialect, example_args) diff --git a/docs/source/tutorials_source/sdk-integration-tutorial.py b/docs/source/tutorials_source/sdk-integration-tutorial.py index ccc2e480ad0..b9a8009c646 100644 --- a/docs/source/tutorials_source/sdk-integration-tutorial.py +++ b/docs/source/tutorials_source/sdk-integration-tutorial.py @@ -6,295 +6,8 @@ # LICENSE file in the root directory of this source tree. """ -Using the ExecuTorch SDK to Profile a Model +Using the ExecuTorch Developer Tools to Profile a Model ======================== -**Author:** `Jack Khuu `__ +Please update your link to . This URL will be deleted after v0.4.0. """ - -###################################################################### -# The `ExecuTorch SDK <../sdk-overview.html>`__ is a set of tools designed to -# provide users with the ability to profile, debug, and visualize ExecuTorch -# models. -# -# This tutorial will show a full end-to-end flow of how to utilize the SDK. -# Specifically, it will: -# -# 1. Generate the artifacts consumed by the SDK (`ETRecord <../sdk-etrecord.html>`__, `ETDump <../sdk-etdump.html>`__). -# 2. Create an Inspector class consuming these artifacts. -# 3. Utilize the Inspector class to analyze the model. - -###################################################################### -# Prerequisites -# ------------- -# -# To run this tutorial, you’ll first need to -# `Set up your ExecuTorch environment <../getting-started-setup.html>`__. -# - -###################################################################### -# Generate ETRecord (Optional) -# ---------------------------- -# -# The first step is to generate an ``ETRecord``. ``ETRecord`` contains model -# graphs and metadata for linking runtime results (such as profiling) to -# the eager model. This is generated via ``executorch.sdk.generate_etrecord``. -# -# ``executorch.sdk.generate_etrecord`` takes in an output file path (str), the -# edge dialect model (``EdgeProgramManager``), the ExecuTorch dialect model -# (``ExecutorchProgramManager``), and an optional dictionary of additional models. 
-# -# In this tutorial, an example model (shown below) is used to demonstrate. - -import copy - -import torch -import torch.nn as nn -import torch.nn.functional as F - -from executorch.exir import ( - EdgeCompileConfig, - EdgeProgramManager, - ExecutorchProgramManager, - to_edge, -) -from executorch.sdk import generate_etrecord -from torch.export import export, ExportedProgram - - -# Generate Model -class Net(nn.Module): - def __init__(self): - super(Net, self).__init__() - # 1 input image channel, 6 output channels, 5x5 square convolution - # kernel - self.conv1 = nn.Conv2d(1, 6, 5) - self.conv2 = nn.Conv2d(6, 16, 5) - # an affine operation: y = Wx + b - self.fc1 = nn.Linear(16 * 5 * 5, 120) # 5*5 from image dimension - self.fc2 = nn.Linear(120, 84) - self.fc3 = nn.Linear(84, 10) - - def forward(self, x): - # Max pooling over a (2, 2) window - x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2)) - # If the size is a square, you can specify with a single number - x = F.max_pool2d(F.relu(self.conv2(x)), 2) - x = torch.flatten(x, 1) # flatten all dimensions except the batch dimension - x = F.relu(self.fc1(x)) - x = F.relu(self.fc2(x)) - x = self.fc3(x) - return x - - -model = Net() - -aten_model: ExportedProgram = export( - model, - (torch.randn(1, 1, 32, 32),), -) - -edge_program_manager: EdgeProgramManager = to_edge( - aten_model, compile_config=EdgeCompileConfig(_check_ir_validity=True) -) -edge_program_manager_copy = copy.deepcopy(edge_program_manager) -et_program_manager: ExecutorchProgramManager = edge_program_manager.to_executorch() - - -# Generate ETRecord -etrecord_path = "etrecord.bin" -generate_etrecord(etrecord_path, edge_program_manager_copy, et_program_manager) - -# sphinx_gallery_start_ignore -from unittest.mock import patch - -# sphinx_gallery_end_ignore - -###################################################################### -# -# .. warning:: -# Users should do a deepcopy of the output of ``to_edge()`` and pass in the -# deepcopy to the ``generate_etrecord`` API. This is needed because the -# subsequent call, ``to_executorch()``, does an in-place mutation and will -# lose debug data in the process. -# - -###################################################################### -# Generate ETDump -# --------------- -# -# Next step is to generate an ``ETDump``. ``ETDump`` contains runtime results -# from executing a `Bundled Program Model <../sdk-bundled-io.html>`__. -# -# In this tutorial, a `Bundled Program` is created from the example model above. - -import torch - -from executorch.exir import to_edge -from executorch.sdk import BundledProgram - -from executorch.sdk.bundled_program.config import MethodTestCase, MethodTestSuite -from executorch.sdk.bundled_program.serialize import ( - serialize_from_bundled_program_to_flatbuffer, -) -from torch.export import export - -# Step 1: ExecuTorch Program Export -m_name = "forward" -method_graphs = {m_name: export(model, (torch.randn(1, 1, 32, 32),))} - -# Step 2: Construct Method Test Suites -inputs = [[torch.randn(1, 1, 32, 32)] for _ in range(2)] - -method_test_suites = [ - MethodTestSuite( - method_name=m_name, - test_cases=[ - MethodTestCase(inputs=inp, expected_outputs=getattr(model, m_name)(*inp)) - for inp in inputs - ], - ) -] - -# Step 3: Generate BundledProgram -executorch_program = to_edge(method_graphs).to_executorch() -bundled_program = BundledProgram(executorch_program, method_test_suites) - -# Step 4: Serialize BundledProgram to flatbuffer. 
-serialized_bundled_program = serialize_from_bundled_program_to_flatbuffer( - bundled_program -) -save_path = "bundled_program.bp" -with open(save_path, "wb") as f: - f.write(serialized_bundled_program) - -###################################################################### -# Use CMake (follow `these instructions <../runtime-build-and-cross-compilation.html#configure-the-cmake-build>`__ to set up cmake) to execute the Bundled Program to generate the ``ETDump``:: -# -# cd executorch -# ./examples/sdk/build_sdk_example_runner.sh -# cmake-out/examples/sdk/sdk_example_runner --bundled_program_path="bundled_program.bp" - -###################################################################### -# Creating an Inspector -# --------------------- -# -# Final step is to create the ``Inspector`` by passing in the artifact paths. -# Inspector takes the runtime results from ``ETDump`` and correlates them to -# the operators of the Edge Dialect Graph. -# -# Recall: An ``ETRecord`` is not required. If an ``ETRecord`` is not provided, -# the Inspector will show runtime results without operator correlation. -# -# To visualize all runtime events, call Inspector's ``print_data_tabular``. - -from executorch.sdk import Inspector - -# sphinx_gallery_start_ignore -inspector_patch = patch.object(Inspector, "__init__", return_value=None) -inspector_patch_print = patch.object(Inspector, "print_data_tabular", return_value="") -inspector_patch.start() -inspector_patch_print.start() -# sphinx_gallery_end_ignore -etdump_path = "etdump.etdp" -inspector = Inspector(etdump_path=etdump_path, etrecord=etrecord_path) -# sphinx_gallery_start_ignore -inspector.event_blocks = [] -# sphinx_gallery_end_ignore -inspector.print_data_tabular() - -# sphinx_gallery_start_ignore -inspector_patch.stop() -inspector_patch_print.stop() -# sphinx_gallery_end_ignore - -###################################################################### -# Analyzing with an Inspector -# --------------------------- -# -# ``Inspector`` provides 2 ways of accessing ingested information: `EventBlocks <../sdk-inspector#eventblock-class>`__ -# and ``DataFrames``. These mediums give users the ability to perform custom -# analysis about their model performance. -# -# Below are examples usages, with both ``EventBlock`` and ``DataFrame`` approaches. - -# Set Up -import pprint as pp - -import pandas as pd - -pd.set_option("display.max_colwidth", None) -pd.set_option("display.max_columns", None) - -###################################################################### -# If a user wants the raw profiling results, they would do something similar to -# finding the raw runtime data of an ``addmm.out`` event. - -for event_block in inspector.event_blocks: - # Via EventBlocks - for event in event_block.events: - if event.name == "native_call_addmm.out": - print(event.name, event.perf_data.raw) - - # Via Dataframe - df = event_block.to_dataframe() - df = df[df.event_name == "native_call_addmm.out"] - print(df[["event_name", "raw"]]) - print() - -###################################################################### -# If a user wants to trace an operator back to their model code, they would do -# something similar to finding the module hierarchy and stack trace of the -# slowest ``convolution.out`` call. 
- -for event_block in inspector.event_blocks: - # Via EventBlocks - slowest = None - for event in event_block.events: - if event.name == "native_call_convolution.out": - if slowest is None or event.perf_data.p50 > slowest.perf_data.p50: - slowest = event - if slowest is not None: - print(slowest.name) - print() - pp.pprint(slowest.stack_traces) - print() - pp.pprint(slowest.module_hierarchy) - - # Via Dataframe - df = event_block.to_dataframe() - df = df[df.event_name == "native_call_convolution.out"] - if len(df) > 0: - slowest = df.loc[df["p50"].idxmax()] - print(slowest.event_name) - print() - pp.pprint(slowest.stack_traces) - print() - pp.pprint(slowest.module_hierarchy) - -###################################################################### -# If a user wants the total runtime of a module, they can use -# ``find_total_for_module``. - -print(inspector.find_total_for_module("L__self__")) -print(inspector.find_total_for_module("L__self___conv2")) - -###################################################################### -# Note: ``find_total_for_module`` is a special first class method of -# `Inspector <../sdk-inspector.html>`__ - -###################################################################### -# Conclusion -# ---------- -# -# In this tutorial, we learned about the steps required to consume an ExecuTorch -# model with the ExecuTorch SDK. It also showed how to use the Inspector APIs -# to analyze the model run results. -# -# Links Mentioned -# ^^^^^^^^^^^^^^^ -# -# - `ExecuTorch SDK <../sdk-overview.html>`__ -# - `ETRecord <../sdk-etrecord.html>`__ -# - `ETDump <../sdk-etdump.html>`__ -# - `Inspector <../sdk-inspector.html>`__ diff --git a/docs/website/docs/tutorials/bundled_program.md b/docs/website/docs/tutorials/bundled_program.md index ac67d6f6285..e477d8e6a61 100644 --- a/docs/website/docs/tutorials/bundled_program.md +++ b/docs/website/docs/tutorials/bundled_program.md @@ -49,19 +49,15 @@ Error GetProgramData( Here's an example of how to use the GetProgramData API: ```c++ - std::shared_ptr buff_ptr; - size_t buff_len; - -// FILE_PATH here can be either BundledProgram or Program flatbuffer file. - Error status = torch::executor::util::read_file_content( - FILE_PATH, &buff_ptr, &buff_len); - ET_CHECK_MSG( - status == Error::Ok, - "read_file_content() failed with status 0x%" PRIx32, - status); - - uint32_t prof_tok = EXECUTORCH_BEGIN_PROF("de-serialize model"); - + // Assume that the user has read the contents of the file into file_data using + // whatever method works best for their application. The file could contain + // either BundledProgram data or Program data. + void* file_data = ...; + size_t file_data_len = ...; + + // If file_data contains a BundledProgram, GetProgramData() will return a + // pointer to the Program data embedded inside it. Otherwise it will return + // file_data, which already pointed to Program data. const void* program_ptr; size_t program_len; status = torch::executor::bundled_program::GetProgramData( @@ -122,14 +118,13 @@ ET_NODISCARD Error VerifyResultWithBundledExpectedOutput( ### Example -Here we provide an example about how to run the bundled program step by step. Most of the code are borrowed from "fbcode/executorch/sdk/fb/runners/executor_runner.cpp" and please review that file if you need more info and context: +Here we provide an example about how to run the bundled program step by step. ```c++ // method_name is the name for the method we want to test // memory_manager is the executor::MemoryManager variable for executor memory allocation. 
// program is the executorch program. Result method = program->load_method(method_name, &memory_manager); - EXECUTORCH_END_PROF(prof_tok); ET_CHECK_MSG( method.ok(), "load_method() failed with status 0x%" PRIx32, diff --git a/examples/README.md b/examples/README.md index f36e873e843..e3a18cf5a0a 100644 --- a/examples/README.md +++ b/examples/README.md @@ -13,7 +13,7 @@ examples ├── models # Contains a set of popular and representative PyTorch models ├── portable # Contains end-to-end demos for ExecuTorch in portable mode ├── selective_build # Contains demos of selective build for optimizing the binary size of the ExecuTorch runtime -├── sdk # Contains demos of BundledProgram and ETDump +├── devtools # Contains demos of BundledProgram and ETDump ├── demo-apps # Contains demo apps for Android and iOS ├── xnnpack # Contains end-to-end ExecuTorch demos with first-party optimization using XNNPACK ├── apple @@ -35,13 +35,17 @@ A user's journey may commence by exploring the demos located in the [`portable/` [This page](./models/llama2/README.md) demonstrates how to run Llama 2 7B and Llama 3 8B models on mobile via ExecuTorch. We use XNNPACK to accelerate the performance and 4-bit groupwise PTQ quantization to fit the model on Android and iOS mobile phones. +## Demo of Llava1.5 7B + +[This page](./models/llava/README.md) demonstrates how to run [Llava 1.5 7B](https://github.com/haotian-liu/LLaVA) model on mobile via ExecuTorch. We use XNNPACK to accelerate the performance and 4-bit groupwise PTQ quantization to fit the model on Android and iOS mobile phones. + ## Demo of Selective Build To understand how to deploy the ExecuTorch runtime with optimization for binary size, explore the demos available in the [`selective_build/`](./selective_build) directory. These demos are specifically designed to illustrate the [Selective Build](../docs/source/kernel-library-selective_build.md), offering insights into reducing the binary size while maintaining efficiency. -## Demo of ExecuTorch SDK +## Demo of ExecuTorch Developer Tools -You will find demos of [ExecuTorch SDK](./sdk/) in the [`sdk/`](./sdk/) directory. The examples focuses on exporting and executing BundledProgram for ExecuTorch model verification and ETDump for collecting profiling and debug data. +You will find demos of [ExecuTorch Developer Tools](./devtools/) in the [`devtools/`](./devtools/) directory. The examples focuses on exporting and executing BundledProgram for ExecuTorch model verification and ETDump for collecting profiling and debug data. 
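As a quick orientation for that demo, the following is a condensed sketch of the export-side BundledProgram flow those examples exercise, assembled from the APIs documented earlier in this change. The toy module, method name, and output file name are illustrative placeholders; the serialized file would then be handed to a runtime example runner for verification.

```python
import torch
from executorch.devtools import BundledProgram
from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite
from executorch.devtools.bundled_program.serialize import (
    serialize_from_bundled_program_to_flatbuffer,
)
from executorch.exir import to_edge
from torch.export import export


class TinyModel(torch.nn.Module):
    def forward(self, x):
        return torch.nn.functional.relu(x)


model = TinyModel()
inputs = (torch.randn(2, 2),)

# Lower the exported "forward" method to an ExecuTorch program.
executorch_program = to_edge({"forward": export(model, inputs)}).to_executorch()

# Bundle a test case whose expected output comes from the eager model.
method_test_suites = [
    MethodTestSuite(
        method_name="forward",
        test_cases=[MethodTestCase(inputs=inputs, expected_outputs=model(*inputs))],
    )
]
bundled_program = BundledProgram(executorch_program, method_test_suites)

# Serialize to flatbuffer so a runtime example runner can verify the outputs.
with open("bundled_program.bpte", "wb") as f:
    f.write(serialize_from_bundled_program_to_flatbuffer(bundled_program))
```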
## Demo Apps diff --git a/examples/apple/coreml/executor_runner/main.mm b/examples/apple/coreml/executor_runner/main.mm index 4cc21ba30a2..405bfb9c6c4 100644 --- a/examples/apple/coreml/executor_runner/main.mm +++ b/examples/apple/coreml/executor_runner/main.mm @@ -13,8 +13,7 @@ #import #import #import -#import -#import +#import #import #import #import @@ -25,8 +24,25 @@ static inline id check_class(id obj, Class cls) { #define SAFE_CAST(Object, Type) ((Type *)check_class(Object, [Type class])) -using namespace torch::executor; -using torch::executor::util::FileDataLoader; +using executorch::etdump::ETDumpGen; +using executorch::etdump::ETDumpResult; +using executorch::extension::FileDataLoader; +using executorch::runtime::DataLoader; +using executorch::runtime::EValue; +using executorch::runtime::Error; +using executorch::runtime::EventTracer; +using executorch::runtime::EventTracerDebugLogLevel; +using executorch::runtime::FreeableBuffer; +using executorch::runtime::HierarchicalAllocator; +using executorch::runtime::MemoryAllocator; +using executorch::runtime::MemoryManager; +using executorch::runtime::Method; +using executorch::runtime::MethodMeta; +using executorch::runtime::Program; +using executorch::runtime::Result; +using executorch::runtime::Span; +using executorch::runtime::TensorInfo; +using torch::executor::CoreMLBackendDelegate; static constexpr size_t kRuntimeMemorySize = 16 * 1024U * 1024U; // 16 MB @@ -295,7 +311,7 @@ bool is_model_analysis_enabled(const Args& args) { } void dump_etdump_gen(ETDumpGen *etdump_gen, const Buffer& debug_buffer, const Args& args) { - etdump_result result = (etdump_gen != nullptr) ? etdump_gen->get_etdump_data() : etdump_result{.buf = nullptr, .size = 0}; + ETDumpResult result = (etdump_gen != nullptr) ? etdump_gen->get_etdump_data() : ETDumpResult{.buf = nullptr, .size = 0}; if (result.size == 0) { return; } @@ -317,7 +333,7 @@ void dump_etdump_gen(ETDumpGen *etdump_gen, const Buffer& debug_buffer, const Ar int main(int argc, char * argv[]) { @autoreleasepool { - runtime_init(); + executorch::runtime::runtime_init(); auto args = parse_command_line_args([[NSProcessInfo processInfo] arguments]); if (args.purge_models_cache) { diff --git a/examples/apple/coreml/scripts/build_executor_runner.sh b/examples/apple/coreml/scripts/build_executor_runner.sh index 16c5dea02a4..89cd45ea6b1 100755 --- a/examples/apple/coreml/scripts/build_executor_runner.sh +++ b/examples/apple/coreml/scripts/build_executor_runner.sh @@ -36,7 +36,7 @@ cmake "$EXECUTORCH_ROOT_PATH" -B"$CMAKE_BUILD_DIR_PATH" \ -DFLATC_EXECUTABLE="$(which flatc)" \ -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=OFF \ -DEXECUTORCH_BUILD_XNNPACK=OFF \ --DEXECUTORCH_BUILD_SDK=ON \ +-DEXECUTORCH_BUILD_DEVTOOLS=ON \ -DEXECUTORCH_BUILD_COREML=ON \ -Dprotobuf_BUILD_TESTS=OFF \ -Dprotobuf_BUILD_EXAMPLES=OFF \ @@ -56,7 +56,7 @@ mkdir -p "$EXECUTORCH_INCLUDE_DIR_PATH" find extension \( -name "*.h" -o -name "*.hpp" \) -exec rsync -R '{}' "$EXECUTORCH_INCLUDE_DIR_PATH" \; find runtime \( -name "*.h" -o -name "*.hpp" \) -exec rsync -R '{}' "$EXECUTORCH_INCLUDE_DIR_PATH" \; find util \( -name "*.h" -o -name "*.hpp" \) -exec rsync -R '{}' "$EXECUTORCH_INCLUDE_DIR_PATH" \; -find sdk \( -name "*.h" -o -name "*.hpp" \) -exec rsync -R '{}' "$EXECUTORCH_INCLUDE_DIR_PATH" \; +find devtools \( -name "*.h" -o -name "*.hpp" \) -exec rsync -R '{}' "$EXECUTORCH_INCLUDE_DIR_PATH" \; cp -rf "$COREML_DIR_PATH/runtime/include/" "$INCLUDE_DIR_PATH" # Copy required libraries diff --git a/examples/apple/coreml/scripts/debugger_cli.py 
b/examples/apple/coreml/scripts/debugger_cli.py index cb978de0746..88390f8d8cb 100644 --- a/examples/apple/coreml/scripts/debugger_cli.py +++ b/examples/apple/coreml/scripts/debugger_cli.py @@ -24,7 +24,7 @@ def get_root_dir_path() -> Path: sys.path.append(str((get_root_dir_path() / "examples").resolve())) from inspector_utils import ( - build_sdk_runner_including_coreml, + build_devtools_runner_including_coreml, ComparisonResult, create_inspector_coreml, create_inspector_reference, @@ -145,7 +145,7 @@ def main() -> None: f"Valid compute units are {valid_compute_units}." ) - build_sdk_runner_including_coreml( + build_devtools_runner_including_coreml( root_dir_path=get_root_dir_path(), conda_env_name=args.conda_environment_name ) diff --git a/examples/apple/coreml/scripts/export.py b/examples/apple/coreml/scripts/export.py index 4bf26a7f3ea..e906c0704cb 100644 --- a/examples/apple/coreml/scripts/export.py +++ b/examples/apple/coreml/scripts/export.py @@ -17,10 +17,10 @@ from executorch.backends.apple.coreml.compiler import CoreMLBackend from executorch.backends.apple.coreml.partition import CoreMLPartitioner +from executorch.devtools.etrecord import generate_etrecord from executorch.exir import to_edge from executorch.exir.backend.backend_api import to_backend -from executorch.sdk.etrecord import generate_etrecord from torch.export import export REPO_ROOT = pathlib.Path(__file__).resolve().parent.parent.parent.parent.parent @@ -104,11 +104,7 @@ def export_lowered_module_to_executorch_program(lowered_module, example_inputs): lowered_module(*example_inputs) exec_prog = to_edge( export(lowered_module, example_inputs), compile_config=_EDGE_COMPILE_CONFIG - ).to_executorch( - config=exir.ExecutorchBackendConfig( - extract_constant_segment=False, extract_delegate_segments=True - ) - ) + ).to_executorch(config=exir.ExecutorchBackendConfig(extract_delegate_segments=True)) return exec_prog @@ -178,9 +174,7 @@ def generate_compile_specs_from_args(args): ) delegated_program_manager = edge_program_manager.to_backend(partitioner) exec_program = delegated_program_manager.to_executorch( - config=exir.ExecutorchBackendConfig( - extract_constant_segment=False, extract_delegate_segments=True - ) + config=exir.ExecutorchBackendConfig(extract_delegate_segments=True) ) else: lowered_module, edge_copy = lower_module_to_coreml( diff --git a/examples/apple/coreml/scripts/inspector_cli.py b/examples/apple/coreml/scripts/inspector_cli.py index 768465f770a..c63d4791fcf 100644 --- a/examples/apple/coreml/scripts/inspector_cli.py +++ b/examples/apple/coreml/scripts/inspector_cli.py @@ -8,8 +8,8 @@ from pathlib import Path -from executorch.sdk import Inspector -from executorch.sdk.inspector._inspector_utils import compare_results +from executorch.devtools import Inspector +from executorch.devtools.inspector import compare_results def get_root_dir_path() -> Path: diff --git a/examples/apple/coreml/scripts/inspector_utils.py b/examples/apple/coreml/scripts/inspector_utils.py index 1736c2cefbf..08af6fb3484 100644 --- a/examples/apple/coreml/scripts/inspector_utils.py +++ b/examples/apple/coreml/scripts/inspector_utils.py @@ -20,6 +20,13 @@ from executorch.backends.apple.coreml.compiler import CoreMLBackend from executorch.backends.apple.coreml.partition import CoreMLPartitioner +from executorch.devtools import BundledProgram, generate_etrecord, Inspector +from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite +from executorch.devtools.bundled_program.serialize import ( + 
serialize_from_bundled_program_to_flatbuffer, +) +from executorch.devtools.inspector import Event + from executorch.exir import ( EdgeProgramManager, ExecutorchBackendConfig, @@ -30,14 +37,6 @@ from executorch.exir.backend.compile_spec_schema import CompileSpec from executorch.exir.tracer import Value -from executorch.sdk import BundledProgram, generate_etrecord, Inspector - -from executorch.sdk.bundled_program.config import MethodTestCase, MethodTestSuite -from executorch.sdk.bundled_program.serialize import ( - serialize_from_bundled_program_to_flatbuffer, -) -from executorch.sdk.inspector import Event - from torch.export import export, ExportedProgram COREML_METADATA_KEYS: Final[List[Tuple[str, str]]] = [ @@ -48,26 +47,26 @@ ] -def build_sdk_runner_including_coreml( +def build_devtools_runner_including_coreml( root_dir_path: Path, conda_env_name: str, force: bool = False, ): if not force: - sdk_executable_path = ( - root_dir_path / "cmake-out" / "examples" / "sdk" / "sdk_example_runner" + devtools_executable_path = ( + root_dir_path / "cmake-out" / "examples" / "devtools" / "example_runner" ) - print(sdk_executable_path) - if sdk_executable_path.is_file(): + print(devtools_executable_path) + if devtools_executable_path.is_file(): return cd_root_command: str = f"cd {root_dir_path.resolve()}" conda_activate_env_command: str = f"source conda activate {conda_env_name}" - build_sdk_runner_command: str = ( - "./examples/sdk/build_sdk_example_runner.sh --coreml" + build_devtools_runner_command: str = ( + "./examples/devtools/build_example_runner.sh --coreml" ) build_command: str = ( - f"{cd_root_command} && {conda_activate_env_command} && {build_sdk_runner_command}" + f"{cd_root_command} && {conda_activate_env_command} && {build_devtools_runner_command}" ) subprocess.run( f'bash -c "{build_command}"', shell=True, check=True @@ -80,7 +79,6 @@ def build_sdk_runner_including_coreml( ) _EDGE_BACKEND_CONFIG = exir.ExecutorchBackendConfig( - extract_constant_segment=False, extract_delegate_segments=True, ) @@ -175,22 +173,24 @@ def generate_etdump_with_intermediate_values( debug_buffer_path: Path, debug_buffer_size: int, ): - sdk_executable_path = ( - root_dir_path / "cmake-out" / "examples" / "sdk" / "sdk_example_runner" + devtools_executable_path = ( + root_dir_path / "cmake-out" / "examples" / "devtools" / "example_runner" ) - if not sdk_executable_path.is_file(): + if not devtools_executable_path.is_file(): raise FileNotFoundError( - errno.ENOENT, os.strerror(errno.ENOENT), str(sdk_executable_path.resolve()) + errno.ENOENT, + os.strerror(errno.ENOENT), + str(devtools_executable_path.resolve()), ) - sdk_runner_command: str = f""" - {sdk_executable_path.resolve()} -dump_intermediate_outputs\ + devtools_runner_command: str = f""" + {devtools_executable_path.resolve()} -dump_intermediate_outputs\ -bundled_program_path {bundled_program_path.resolve()}\ -etdump_path {et_dump_path.resolve()}\ -debug_output_path {debug_buffer_path.resolve()}\ -debug_buffer_size {debug_buffer_size}""" subprocess.run( - f'bash -c "{sdk_runner_command}"', shell=True, check=True + f'bash -c "{devtools_runner_command}"', shell=True, check=True ).check_returncode() diff --git a/examples/apple/mps/CMakeLists.txt b/examples/apple/mps/CMakeLists.txt index d1dd8e93d7e..319d8159ced 100644 --- a/examples/apple/mps/CMakeLists.txt +++ b/examples/apple/mps/CMakeLists.txt @@ -92,8 +92,8 @@ if(NOT CMAKE_TOOLCHAIN_FILE MATCHES ".*(iOS|ios\.toolchain)\.cmake$") include(${EXECUTORCH_SRCS_FILE}) target_include_directories( bundled_program - 
INTERFACE ${CMAKE_CURRENT_BINARY_DIR}/../../../sdk/include - ${CMAKE_CURRENT_BINARY_DIR}/../../../sdk/bundled_program + INTERFACE ${CMAKE_CURRENT_BINARY_DIR}/../../../devtools/include + ${CMAKE_CURRENT_BINARY_DIR}/../../../devtools/bundled_program ${EXECUTORCH_ROOT}/third-party/flatbuffers/include ${EXECUTORCH_ROOT}/third-party/flatcc/include ${_mps_schema_headers} diff --git a/examples/apple/mps/README.md b/examples/apple/mps/README.md index bebd1329be4..dc01d585f84 100644 --- a/examples/apple/mps/README.md +++ b/examples/apple/mps/README.md @@ -30,7 +30,7 @@ Once we have the model binary file, then let's run it with the ExecuTorch runtim # Build and install executorch cmake -DCMAKE_INSTALL_PREFIX=cmake-out \ -DCMAKE_BUILD_TYPE=Release \ - -DEXECUTORCH_BUILD_SDK=ON \ + -DEXECUTORCH_BUILD_DEVTOOLS=ON \ -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ -DEXECUTORCH_BUILD_MPS=ON \ -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ diff --git a/examples/apple/mps/executor_runner/mps_executor_runner.mm b/examples/apple/mps/executor_runner/mps_executor_runner.mm index 604419a620e..e3d0e2978b6 100644 --- a/examples/apple/mps/executor_runner/mps_executor_runner.mm +++ b/examples/apple/mps/executor_runner/mps_executor_runner.mm @@ -30,8 +30,8 @@ #include #include #include -#include -#include +#include +#include #include using namespace std::chrono; @@ -97,8 +97,26 @@ 262144, // 256 KB "Size of the debug buffer in bytes to allocate for intermediate outputs and program outputs logging."); -using namespace torch::executor; -using torch::executor::util::FileDataLoader; +using executorch::etdump::ETDumpGen; +using executorch::etdump::ETDumpResult; +using executorch::extension::BufferCleanup; +using executorch::extension::BufferDataLoader; +using executorch::extension::FileDataLoader; +using executorch::runtime::DataLoader; +using executorch::runtime::EValue; +using executorch::runtime::Error; +using executorch::runtime::EventTracerDebugLogLevel; +using executorch::runtime::FreeableBuffer; +using executorch::runtime::HierarchicalAllocator; +using executorch::runtime::MemoryAllocator; +using executorch::runtime::MemoryManager; +using executorch::runtime::Method; +using executorch::runtime::MethodMeta; +using executorch::runtime::Program; +using executorch::runtime::Result; +using executorch::runtime::Span; + +namespace bundled_program = executorch::bundled_program; int main(int argc, char** argv) { { @@ -113,7 +131,7 @@ int main(int argc, char** argv) { return 1; } - runtime_init(); + executorch::runtime::runtime_init(); gflags::ParseCommandLineFlags(&argc, &argv, true); if (argc != 1) { @@ -144,20 +162,20 @@ int main(int argc, char** argv) { // Find the offset to the embedded Program. const void* program_data; size_t program_data_len; - Error status = torch::executor::bundled_program::GetProgramData( + Error status = bundled_program::get_program_data( const_cast(file_data->data()), file_data->size(), &program_data, &program_data_len); ET_CHECK_MSG( status == Error::Ok, - "GetProgramData() failed on file '%s': 0x%x", + "get_program_data() failed on file '%s': 0x%x", model_path, (unsigned int)status); // Wrap the buffer in a DataLoader. auto buffer_data_loader = - util::BufferDataLoader(program_data, program_data_len); + BufferDataLoader(program_data, program_data_len); // Parse the program file. This is immutable, and can also be reused between // multiple execution invocations across multiple threads. @@ -239,7 +257,7 @@ HierarchicalAllocator planned_memory( // be used by a single thread at at time, but it can be reused. 
// - torch::executor::ETDumpGen etdump_gen = torch::executor::ETDumpGen(); + ETDumpGen etdump_gen; Result method = program->load_method(method_name, &memory_manager, &etdump_gen); ET_CHECK_MSG( @@ -263,11 +281,11 @@ HierarchicalAllocator planned_memory( } // Prepare the inputs. - std::unique_ptr inputs; + std::unique_ptr inputs; if (FLAGS_bundled_program) { ET_LOG(Info, "Loading bundled program..."); // Use the inputs embedded in the bundled program. - status = torch::executor::bundled_program::LoadBundledInput( + status = bundled_program::load_bundled_input( *method, file_data->data(), FLAGS_testset_idx); @@ -278,11 +296,11 @@ HierarchicalAllocator planned_memory( } else { ET_LOG(Info, "Loading non-bundled program...\n"); // Use ones-initialized inputs. - auto inputs_result = torch::executor::util::prepare_input_tensors(*method); + auto inputs_result = executorch::extension::prepare_input_tensors(*method); if (inputs_result.ok()) { // Will free the inputs when destroyed. inputs = - std::make_unique(std::move(inputs_result.get())); + std::make_unique(std::move(inputs_result.get())); } } ET_LOG(Info, "Inputs prepared."); @@ -322,14 +340,14 @@ HierarchicalAllocator planned_memory( status = method->get_outputs(outputs.data(), outputs.size()); ET_CHECK(status == Error::Ok); // Print the first and last 100 elements of long lists of scalars. - std::cout << torch::executor::util::evalue_edge_items(100); + std::cout << executorch::extension::evalue_edge_items(100); for (int i = 0; i < outputs.size(); ++i) { std::cout << "Output " << i << ": " << outputs[i] << std::endl; } // Dump the etdump data containing profiling/debugging data to the specified // file. - etdump_result result = etdump_gen.get_etdump_data(); + ETDumpResult result = etdump_gen.get_etdump_data(); if (result.buf != nullptr && result.size > 0) { FILE* f = fopen(FLAGS_etdump_path.c_str(), "w+"); fwrite((uint8_t*)result.buf, 1, result.size, f); @@ -362,7 +380,7 @@ HierarchicalAllocator planned_memory( atol = 1e-01; rtol = 1e-01; } - status = torch::executor::bundled_program::VerifyResultWithBundledExpectedOutput( + status = bundled_program::verify_method_outputs( *method, file_data->data(), FLAGS_testset_idx, diff --git a/examples/apple/mps/executor_runner/targets.bzl b/examples/apple/mps/executor_runner/targets.bzl index fd0a7a50468..14399411ae3 100644 --- a/examples/apple/mps/executor_runner/targets.bzl +++ b/examples/apple/mps/executor_runner/targets.bzl @@ -28,9 +28,9 @@ def define_common_targets(): "//executorch/extension/data_loader:file_data_loader", "//executorch/kernels/portable:generated_lib", "//executorch/extension/data_loader:file_data_loader", - "//executorch/sdk/etdump:etdump_flatcc", + "//executorch/devtools/etdump:etdump_flatcc", "//executorch/extension/data_loader:buffer_data_loader", - "//executorch/sdk/bundled_program:runtime", + "//executorch/devtools/bundled_program:runtime", ], external_deps = [ "gflags", diff --git a/examples/apple/mps/scripts/build_mps_executor_runner.sh b/examples/apple/mps/scripts/build_mps_executor_runner.sh index 16754588b67..31ab54fd4d3 100755 --- a/examples/apple/mps/scripts/build_mps_executor_runner.sh +++ b/examples/apple/mps/scripts/build_mps_executor_runner.sh @@ -41,7 +41,7 @@ rm -rf "$OUTPUT" cmake -DBUCK2="$BUCK" \ -DCMAKE_INSTALL_PREFIX=cmake-out \ -DCMAKE_BUILD_TYPE="$MODE" \ - -DEXECUTORCH_BUILD_SDK=ON \ + -DEXECUTORCH_BUILD_DEVTOOLS=ON \ -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ -DEXECUTORCH_BUILD_MPS=ON \ -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ diff --git 
a/examples/apple/mps/scripts/mps_example.py b/examples/apple/mps/scripts/mps_example.py index e561afb1858..d6416e0ffc8 100644 --- a/examples/apple/mps/scripts/mps_example.py +++ b/examples/apple/mps/scripts/mps_example.py @@ -14,6 +14,11 @@ from executorch import exir from executorch.backends.apple.mps import MPSBackend from executorch.backends.apple.mps.partition import MPSPartitioner +from executorch.devtools import BundledProgram, generate_etrecord +from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite +from executorch.devtools.bundled_program.serialize import ( + serialize_from_bundled_program_to_flatbuffer, +) from executorch.exir import ( EdgeCompileConfig, @@ -24,11 +29,6 @@ from executorch.exir.backend.backend_details import CompileSpec from executorch.exir.capture._config import ExecutorchBackendConfig from executorch.extension.export_util.utils import export_to_edge, save_pte_program -from executorch.sdk import BundledProgram, generate_etrecord -from executorch.sdk.bundled_program.config import MethodTestCase, MethodTestSuite -from executorch.sdk.bundled_program.serialize import ( - serialize_from_bundled_program_to_flatbuffer, -) from ....models import MODEL_NAME_TO_MODEL from ....models.model_factory import EagerModelFactory @@ -183,9 +183,7 @@ def get_model_config(args): logging.info(f"Lowered graph:\n{edge.exported_program().graph}") executorch_program = edge.to_executorch( - config=ExecutorchBackendConfig( - extract_delegate_segments=False, extract_constant_segment=False - ) + config=ExecutorchBackendConfig(extract_delegate_segments=False) ) else: lowered_module = to_backend( @@ -195,11 +193,7 @@ def get_model_config(args): lowered_module, example_inputs, edge_compile_config=exir.EdgeCompileConfig(_check_ir_validity=False), - ).to_executorch( - config=ExecutorchBackendConfig( - extract_delegate_segments=False, extract_constant_segment=False - ) - ) + ).to_executorch(config=ExecutorchBackendConfig(extract_delegate_segments=False)) model_name = f"{args.model_name}_mps" diff --git a/examples/apple/mps/test_mps.sh b/examples/apple/mps/test_mps.sh index 55712089e07..555161dd3f7 100755 --- a/examples/apple/mps/test_mps.sh +++ b/examples/apple/mps/test_mps.sh @@ -11,14 +11,14 @@ set -e # shellcheck source=/dev/null source "$(dirname "${BASH_SOURCE[0]}")/../../../.ci/scripts/utils.sh" -cmake_install_executorch_sdk_lib() { +cmake_install_executorch_devtools_lib() { echo "Installing libexecutorch.a, libportable_kernels.a, libetdump.a, libbundled_program.a" rm -rf cmake-out retry cmake -DBUCK2="$BUCK" \ -DCMAKE_INSTALL_PREFIX=cmake-out \ -DCMAKE_BUILD_TYPE=Release \ - -DEXECUTORCH_BUILD_SDK=ON \ + -DEXECUTORCH_BUILD_DEVTOOLS=ON \ -DEXECUTORCH_BUILD_MPS=ON \ -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ @@ -60,5 +60,5 @@ then fi -cmake_install_executorch_sdk_lib +cmake_install_executorch_devtools_lib test_cmake_mps diff --git a/examples/arm/aot_arm_compiler.py b/examples/arm/aot_arm_compiler.py index f854a081fa6..9a45195e58f 100644 --- a/examples/arm/aot_arm_compiler.py +++ b/examples/arm/aot_arm_compiler.py @@ -214,7 +214,11 @@ def forward(self, x): edge = edge.to_backend( ArmPartitioner( ArmCompileSpecBuilder() - .ethosu_compile_spec("ethos-u55-128") + .ethosu_compile_spec( + "ethos-u55-128", + system_config="Ethos_U55_High_End_Embedded", + memory_mode="Shared_Sram", + ) .set_permute_memory_format( args.model_name in MODEL_NAME_TO_MODEL.keys() ) @@ -226,9 +230,7 @@ def forward(self, x): try: exec_prog = 
edge.to_executorch( - config=ExecutorchBackendConfig( - extract_delegate_segments=False, extract_constant_segment=False - ) + config=ExecutorchBackendConfig(extract_delegate_segments=False) ) except RuntimeError as e: if "Missing out variants" in str(e.args[0]): diff --git a/examples/arm/executor_runner/CMakeLists.txt b/examples/arm/executor_runner/CMakeLists.txt index 1f42eda9fbc..68c5435dffe 100644 --- a/examples/arm/executor_runner/CMakeLists.txt +++ b/examples/arm/executor_runner/CMakeLists.txt @@ -9,25 +9,38 @@ project(arm_executor_runner) option(SEMIHOSTING "Enable semihosting" OFF) if(NOT DEFINED ET_PTE_FILE_PATH AND NOT ${SEMIHOSTING}) - message(FATAL_ERROR - "ET_PTE_FILE_PATH must specify a model .pte, for bare metal systems the " - "model is built into the binary.") + message( + FATAL_ERROR + "ET_PTE_FILE_PATH must specify a model .pte, for bare metal systems the " + "model is built into the binary." + ) endif() # Example ExecuTorch demo for bare metal Cortex-M based systems -set(ET_DIR_PATH "../../.." CACHE PATH - "Path to ExecuTorch dir") -set(ET_BUILD_DIR_PATH "${ET_DIR_PATH}/cmake-out" CACHE PATH - "Path to ExecuTorch build dir") -set(ET_INCLUDE_PATH "${ET_DIR_PATH}/.." CACHE PATH - "Path to ExecuTorch headers") -set(ET_PTE_FILE_PATH "" CACHE PATH - "Path to ExecuTorch model pte") -set(ETHOS_SDK_PATH "${ET_DIR_PATH}/examples/arm/ethos-u-scratch/ethos-u" CACHE PATH - "Path to Ethos-U bare metal driver/env") -set(PYTHON_EXECUTABLE "python" CACHE PATH - "Define to override python executable used") - +set(ET_DIR_PATH + "../../.." + CACHE PATH "Path to ExecuTorch dir" +) +set(ET_BUILD_DIR_PATH + "${ET_DIR_PATH}/cmake-out" + CACHE PATH "Path to ExecuTorch build dir" +) +set(ET_INCLUDE_PATH + "${ET_DIR_PATH}/.." + CACHE PATH "Path to ExecuTorch headers" +) +set(ET_PTE_FILE_PATH + "" + CACHE PATH "Path to ExecuTorch model pte" +) +set(ETHOS_SDK_PATH + "${ET_DIR_PATH}/examples/arm/ethos-u-scratch/ethos-u" + CACHE PATH "Path to Ethos-U bare metal driver/env" +) +set(PYTHON_EXECUTABLE + "python" + CACHE PATH "Define to override python executable used" +) get_filename_component(ET_BUILD_DIR_PATH ${ET_BUILD_DIR_PATH} REALPATH) get_filename_component(ET_DIR_PATH ${ET_DIR_PATH} REALPATH) @@ -104,23 +117,25 @@ set_property( # Convert pte to header if(NOT ${SEMIHOSTING}) - add_custom_target(gen_model_header - DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/model_pte.h) + add_custom_target( + gen_model_header DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/model_pte.h + ) add_custom_command( - OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/model_pte.h - COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/pte_to_header.py - --pte ${ET_PTE_FILE_PATH} - --outdir ${CMAKE_CURRENT_BINARY_DIR} - DEPENDS ${ET_PTE_FILE_PATH} - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/model_pte.h + COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/pte_to_header.py --pte + ${ET_PTE_FILE_PATH} --outdir ${CMAKE_CURRENT_BINARY_DIR} + DEPENDS ${ET_PTE_FILE_PATH} + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} ) endif() # The arm_executor_runner executable add_executable(arm_executor_runner) -target_sources(arm_executor_runner PRIVATE arm_executor_runner.cpp) +target_sources( + arm_executor_runner PRIVATE arm_executor_runner.cpp arm_perf_monitor.cpp +) # Include the target's bare-metal linker script ethosu_eval_link_options(arm_executor_runner) @@ -146,19 +161,17 @@ target_include_directories( arm_executor_runner PRIVATE ${ET_INCLUDE_PATH} ${CMAKE_CURRENT_BINARY_DIR} ) - - if(SEMIHOSTING) 
-target_compile_definitions(arm_executor_runner PUBLIC SEMIHOSTING) + target_compile_definitions(arm_executor_runner PUBLIC SEMIHOSTING) else() -add_dependencies(arm_executor_runner gen_model_header) + add_dependencies(arm_executor_runner gen_model_header) endif() # Fixup compilation of retarget.c if(SEMIHOSTING) -# Remove this when MLBEDSW-8910 is closed. -set_source_files_properties( - ${ETHOS_SDK_PATH}/core_platform/targets/corstone-300/retarget.c - PROPERTIES HEADER_FILE_ONLY TRUE -) + # Remove this when MLBEDSW-8910 is closed. + set_source_files_properties( + ${ETHOS_SDK_PATH}/core_platform/targets/corstone-300/retarget.c + PROPERTIES HEADER_FILE_ONLY TRUE + ) endif() diff --git a/examples/arm/executor_runner/arm_executor_runner.cpp b/examples/arm/executor_runner/arm_executor_runner.cpp index 6256ff47cf6..9ca3ebcdc7c 100644 --- a/examples/arm/executor_runner/arm_executor_runner.cpp +++ b/examples/arm/executor_runner/arm_executor_runner.cpp @@ -20,36 +20,59 @@ #include #include -/** - * This header file is generated by the build process based on the .pte file - * specified in the ET_PTE_FILE_PATH variable to the cmake build. - * Control of the action of the .pte, it's use of operators and delegates, and - * which are included in the bare metal build are also orchestrated by the - * CMakeLists file. For example use see examples/arm/run.sh - */ +#include "arm_perf_monitor.h" + #ifdef SEMIHOSTING -// TODO: Verify the section attribute to match the linker script -// pending MLETORCH-39 -const size_t input_allocation_pool_size = 1 * 1024 * 1024; +// In our unit test flow, we have the capability to provide an enitre model to +// the Corstone-3xx FVP using semi hosting. Hence, the input allocation pool +// needs to be large enough to take an entire model. On the FVP, +// network_model_sec is linked to the DDR, which is large (256MB on +// Corstone-300). +const size_t input_allocation_pool_size = 100 * 1024 * 1024; unsigned char __attribute__(( section("network_model_sec"), aligned(16))) input_allocation_pool[input_allocation_pool_size]; // memory for the model will be allocated from the input_allocation_pool char* model_pte = nullptr; #else +/** + * This header file is generated by the build process based on the .pte file + * specified in the ET_PTE_FILE_PATH variable to the cmake build. + * Control of the action of the .pte, it's use of operators and delegates, and + * which are included in the bare metal build are also orchestrated by the + * CMakeLists file. 
For example use see examples/arm/run.sh + */ #include "model_pte.h" #endif -using namespace exec_aten; -using namespace std; -using torch::executor::Error; -using torch::executor::Result; +using executorch::aten::ScalarType; +using executorch::aten::Tensor; +using executorch::aten::TensorImpl; +using executorch::extension::BufferCleanup; +using executorch::extension::BufferDataLoader; +using executorch::runtime::Error; +using executorch::runtime::EValue; +using executorch::runtime::HierarchicalAllocator; +using executorch::runtime::MemoryAllocator; +using executorch::runtime::MemoryManager; +using executorch::runtime::Method; +using executorch::runtime::MethodMeta; +using executorch::runtime::Program; +using executorch::runtime::Result; +using executorch::runtime::Span; +using executorch::runtime::Tag; +using executorch::runtime::TensorInfo; #define METHOD_ALLOCATOR_POOL_SIZE (70 * 1024 * 1024) unsigned char __attribute__(( section("network_model_sec"), aligned(16))) method_allocation_pool[METHOD_ALLOCATOR_POOL_SIZE]; +const size_t temp_allocation_pool_size = 1 * 1024 * 1024; +unsigned char __attribute__(( + section("network_model_sec"), + aligned(16))) temp_allocation_pool[temp_allocation_pool_size]; + void et_pal_init(void) {} ET_NORETURN void et_pal_abort(void) { @@ -71,24 +94,26 @@ void et_pal_emit_log_message( size_t line, const char* message, ET_UNUSED size_t length) { - fprintf(stderr, "%c executorch:%s:%zu] %s\n", level, filename, line, message); + fprintf( + stderr, "%c [executorch:%s:%zu] %s\n", level, filename, line, message); } namespace { -using namespace torch::executor; -Result prepare_input_tensors( +Result prepare_input_tensors( Method& method, - torch::executor::MemoryAllocator& allocator, + MemoryAllocator& allocator, std::vector>& input_buffers) { MethodMeta method_meta = method.method_meta(); size_t num_inputs = method_meta.num_inputs(); size_t num_allocated = 0; +#ifdef SEMIHOSTING ET_CHECK_OR_RETURN_ERROR( input_buffers.size() > 0 && num_inputs == input_buffers.size(), InvalidArgument, "Wrong number of inputs allocated compared to method"); +#endif void** inputs = static_cast(allocator.allocate(num_inputs * sizeof(void*))); @@ -162,18 +187,18 @@ Result prepare_input_tensors( ET_LOG( Error, "Failed to prepare input %zu: 0x%" PRIx32, i, (uint32_t)err); // The BufferCleanup will free the inputs when it goes out of scope. 
- util::BufferCleanup cleanup({inputs, num_allocated}); + BufferCleanup cleanup({inputs, num_allocated}); return err; } } - return util::BufferCleanup({inputs, num_allocated}); + return BufferCleanup({inputs, num_allocated}); } #ifdef SEMIHOSTING std::pair read_binary_file( const char* filename, - torch::executor::MemoryAllocator& allocator) { + MemoryAllocator& allocator) { FILE* fp = fopen(filename, "rb"); if (!fp) { ET_LOG( @@ -225,13 +250,13 @@ int main(int argc, const char* argv[]) { (void)argv; #endif - torch::executor::runtime_init(); + executorch::runtime::runtime_init(); std::vector> input_buffers; size_t pte_size = sizeof(model_pte); #ifdef SEMIHOSTING const char* output_basename = nullptr; - torch::executor::MemoryAllocator input_allocator( + MemoryAllocator input_allocator( input_allocation_pool_size, input_allocation_pool); /* parse input parameters */ @@ -264,10 +289,9 @@ int main(int argc, const char* argv[]) { } #endif ET_LOG(Info, "Model in %p %c", model_pte, model_pte[0]); - auto loader = torch::executor::util::BufferDataLoader(model_pte, pte_size); + auto loader = BufferDataLoader(model_pte, pte_size); ET_LOG(Info, "Model PTE file loaded. Size: %lu bytes.", pte_size); - Result program = - torch::executor::Program::load(&loader); + Result program = Program::load(&loader); if (!program.ok()) { ET_LOG( Info, @@ -286,8 +310,7 @@ int main(int argc, const char* argv[]) { } ET_LOG(Info, "Running method %s", method_name); - Result method_meta = - program->method_meta(method_name); + Result method_meta = program->method_meta(method_name); if (!method_meta.ok()) { ET_LOG( Info, @@ -296,13 +319,11 @@ int main(int argc, const char* argv[]) { (unsigned int)method_meta.error()); } - torch::executor::MemoryAllocator method_allocator{ - torch::executor::MemoryAllocator( - METHOD_ALLOCATOR_POOL_SIZE, method_allocation_pool)}; + MemoryAllocator method_allocator( + METHOD_ALLOCATOR_POOL_SIZE, method_allocation_pool); std::vector planned_buffers; // Owns the memory - std::vector> - planned_spans; // Passed to the allocator + std::vector> planned_spans; // Passed to the allocator size_t num_memory_planned_buffers = method_meta->num_memory_planned_buffers(); for (size_t id = 0; id < num_memory_planned_buffers; ++id) { @@ -317,14 +338,16 @@ int main(int argc, const char* argv[]) { planned_spans.push_back({planned_buffers.back(), buffer_size}); } - torch::executor::HierarchicalAllocator planned_memory( + HierarchicalAllocator planned_memory( {planned_spans.data(), planned_spans.size()}); - torch::executor::MemoryManager memory_manager( - &method_allocator, &planned_memory); + MemoryAllocator temp_allocator( + temp_allocation_pool_size, temp_allocation_pool); - Result method = - program->load_method(method_name, &memory_manager); + MemoryManager memory_manager( + &method_allocator, &planned_memory, &temp_allocator); + + Result method = program->load_method(method_name, &memory_manager); if (!method.ok()) { ET_LOG( Info, @@ -349,7 +372,10 @@ int main(int argc, const char* argv[]) { ET_LOG(Info, "Input prepared."); ET_LOG(Info, "Starting the model execution..."); + StartMeasurements(); Error status = method->execute(); + StopMeasurements(); + if (status != Error::Ok) { ET_LOG( Info, @@ -360,13 +386,15 @@ int main(int argc, const char* argv[]) { ET_LOG(Info, "Model executed successfully."); } - std::vector outputs(method->outputs_size()); + std::vector outputs(method->outputs_size()); ET_LOG(Info, "%zu outputs: ", outputs.size()); status = method->get_outputs(outputs.data(), outputs.size()); 
ET_CHECK(status == Error::Ok); for (int i = 0; i < outputs.size(); ++i) { Tensor t = outputs[i].toTensor(); #ifndef SEMIHOSTING + // The output might be collected and parsed so printf() is used instead + // of ET_LOG() here for (int j = 0; j < outputs[i].toTensor().numel(); ++j) { if (t.scalar_type() == ScalarType::Int) { printf( diff --git a/examples/arm/executor_runner/arm_perf_monitor.cpp b/examples/arm/executor_runner/arm_perf_monitor.cpp new file mode 100644 index 00000000000..c53d28baab4 --- /dev/null +++ b/examples/arm/executor_runner/arm_perf_monitor.cpp @@ -0,0 +1,173 @@ +/* Copyright 2024 Arm Limited and/or its affiliates. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +#include "arm_perf_monitor.h" + +#ifdef ETHOSU +#include +#include +#include + +static uint32_t ethosu_inference_count = 0; +static uint64_t ethosu_ArmBackendExecuteCycleCountStart = 0; +static uint64_t ethosu_ArmBackendExecuteCycleCount = 0; +static uint64_t ethosu_ArmWhenNPURunCycleCountStart = 0; +static uint64_t ethosu_ArmWhenNPURunCycleCount = 0; +static uint64_t ethosu_pmuCycleCount = 0; +static std::vector ethosu_pmuEventCounts( + ETHOSU_PMU_Get_NumEventCounters(), + 0); + +static const uint32_t ethosu_pmuCountersUsed = 4; +// ethosu_pmuCountersUsed should match numbers of counters setup in +// ethosu_inference_begin() and not be more then the HW supports +static_assert(ETHOSU_PMU_NCOUNTERS >= ethosu_pmuCountersUsed); + +extern "C" { + +// Callback invoked at start of NPU execution +void ethosu_inference_begin(struct ethosu_driver* drv, void*) { + // Enable PMU + ETHOSU_PMU_Enable(drv); + ETHOSU_PMU_PMCCNTR_CFG_Set_Stop_Event(drv, ETHOSU_PMU_NPU_IDLE); + ETHOSU_PMU_PMCCNTR_CFG_Set_Start_Event(drv, ETHOSU_PMU_NPU_ACTIVE); + + // Setup 4 counters + ETHOSU_PMU_Set_EVTYPER(drv, 0, ETHOSU_PMU_AXI0_RD_DATA_BEAT_RECEIVED); + ETHOSU_PMU_Set_EVTYPER(drv, 1, ETHOSU_PMU_AXI1_RD_DATA_BEAT_RECEIVED); + ETHOSU_PMU_Set_EVTYPER(drv, 2, ETHOSU_PMU_AXI0_WR_DATA_BEAT_WRITTEN); + ETHOSU_PMU_Set_EVTYPER(drv, 3, ETHOSU_PMU_NPU_IDLE); + // Enable 4 counters + ETHOSU_PMU_CNTR_Enable(drv, 0xf); + + ETHOSU_PMU_CNTR_Enable(drv, ETHOSU_PMU_CCNT_Msk); + ETHOSU_PMU_CYCCNT_Reset(drv); + + // Reset all counters + ETHOSU_PMU_EVCNTR_ALL_Reset(drv); + + // Save Cortex-M cycle clock to calculate total CPU cycles used in + // ethosu_inference_end() + ethosu_ArmWhenNPURunCycleCountStart = ARM_PMU_Get_CCNTR(); +} + +// Callback invoked at end of NPU execution +void ethosu_inference_end(struct ethosu_driver* drv, void*) { + ethosu_inference_count++; + ethosu_pmuCycleCount += ETHOSU_PMU_Get_CCNTR(drv); + + for (size_t i = 0; i < ethosu_pmuCountersUsed; i++) { + ethosu_pmuEventCounts[i] += ETHOSU_PMU_Get_EVCNTR(drv, i); + } + ETHOSU_PMU_Disable(drv); + // Add Cortex-M cycle clock used during this NPU execution + ethosu_ArmWhenNPURunCycleCount += + (ARM_PMU_Get_CCNTR() - ethosu_ArmWhenNPURunCycleCountStart); +} + +// Callback invoked at start of ArmBackend::execute() +void ArmBackend_execute_begin() { + // Save Cortex-M cycle clock to calculate total CPU cycles used in + // ArmBackend_execute_end() + ethosu_ArmBackendExecuteCycleCountStart = ARM_PMU_Get_CCNTR(); +} + +// Callback invoked at end of ArmBackend::execute() +void ArmBackend_execute_end() { + // Add Cortex-M cycle clock used during this ArmBackend::execute() + ethosu_ArmBackendExecuteCycleCount += + (ARM_PMU_Get_CCNTR() - ethosu_ArmBackendExecuteCycleCountStart); +} +} + 
+void StartMeasurements() { + ethosu_ArmBackendExecuteCycleCount = 0; + ethosu_ArmWhenNPURunCycleCount = 0; + ethosu_pmuCycleCount = 0; + + for (size_t i = 0; i < ethosu_pmuCountersUsed; i++) { + ethosu_pmuEventCounts[i] = 0; + } + ARM_PMU_Enable(); + DCB->DEMCR |= DCB_DEMCR_TRCENA_Msk; // Trace enable + ARM_PMU_CYCCNT_Reset(); + ARM_PMU_CNTR_Enable(PMU_CNTENSET_CCNTR_ENABLE_Msk); +} + +void StopMeasurements() { + ARM_PMU_CNTR_Disable( + PMU_CNTENCLR_CCNTR_ENABLE_Msk | PMU_CNTENCLR_CNT0_ENABLE_Msk | + PMU_CNTENCLR_CNT1_ENABLE_Msk); + uint32_t cycle_count = ARM_PMU_Get_CCNTR(); + + // Number of comand streams handled by the NPU + ET_LOG(Info, "NPU Inferences : %d", ethosu_inference_count); + ET_LOG(Info, "Profiler report, CPU cycles per operator:"); + // This is number of CPU cycles for the ethos-u operator from start to finish + // in the framework If there is more then one commandstream the time is added + // together + ET_LOG( + Info, + "ethos-u : cycle_cnt : %d cycles", + ethosu_ArmBackendExecuteCycleCount); + // We could print a list of the cycles used by the other delegates here in the + // future but now we only print ethos-u: this means that "Operator(s) total: + // ..." will be the same number as ethos-u : cycle_cnt and not the sum of all + ET_LOG( + Info, + "Operator(s) total: %d CPU cycles", + ethosu_ArmBackendExecuteCycleCount); + // Total CPU cycles used in the executorch method->execute() + // Other delegates and no delegates are counted in this + ET_LOG(Info, "Inference runtime: %d CPU cycles total", cycle_count); + + ET_LOG( + Info, + "NOTE: CPU cycle values and ratio calculations require FPGA and identical CPU/NPU frequency"); + + // Avoid division with zero if ARM_PMU_Get_CCNTR() is not enabled properly. + if (cycle_count == 0) { + ET_LOG(Info, "Inference CPU ratio: ?.?? %%"); + ET_LOG(Info, "Inference NPU ratio: ?.?? %%"); + } else { + ET_LOG( + Info, + "Inference CPU ratio: %.2f %%", + 100.0 * (cycle_count - ethosu_ArmWhenNPURunCycleCount) / cycle_count); + ET_LOG( + Info, + "Inference NPU ratio: %.2f %%", + 100.0 * ethosu_ArmWhenNPURunCycleCount / cycle_count); + } + + // CPU cycles used by NPU, e.g. number of CPU cycles used between + // ethosu_inference_begin() and ethosu_inference_end() + // If there is more then one commandstream the time is added together + ET_LOG( + Info, + "cpu_wait_for_npu_cntr : %" PRIu64 " CPU cycles", + ethosu_ArmWhenNPURunCycleCount); + + ET_LOG(Info, "Ethos-U PMU report:"); + ET_LOG(Info, "ethosu_pmu_cycle_cntr : %" PRIu64, ethosu_pmuCycleCount); + + for (size_t i = 0; i < ethosu_pmuCountersUsed; i++) { + ET_LOG(Info, "ethosu_pmu_cntr%zd : %" PRIu64, i, ethosu_pmuEventCounts[i]); + } + ET_LOG( + Info, + "Ethos-U PMU Events:[ETHOSU_PMU_AXI0_RD_DATA_BEAT_RECEIVED, ETHOSU_PMU_AXI1_RD_DATA_BEAT_RECEIVED, ETHOSU_PMU_AXI0_WR_DATA_BEAT_WRITTEN, ETHOSU_PMU_NPU_IDLE]"); +} + +#else +void StartMeasurements() {} + +void StopMeasurements() {} + +#endif diff --git a/examples/arm/executor_runner/arm_perf_monitor.h b/examples/arm/executor_runner/arm_perf_monitor.h new file mode 100644 index 00000000000..3925a9a5713 --- /dev/null +++ b/examples/arm/executor_runner/arm_perf_monitor.h @@ -0,0 +1,10 @@ +/* Copyright 2024 Arm Limited and/or its affiliates. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +void StartMeasurements(); +void StopMeasurements(); diff --git a/examples/arm/run.sh b/examples/arm/run.sh index f41e0ef50c6..4a3f6dbf672 100755 --- a/examples/arm/run.sh +++ b/examples/arm/run.sh @@ -147,6 +147,10 @@ function build_executorch_runner() { cmake --build cmake-out -- -j"$((n - 5))" arm_executor_runner echo "[${FUNCNAME[0]}] Generated baremetal elf file:" find cmake-out -name "arm_executor_runner" + echo "executable_text: $(find -name arm_executor_runner -exec size {} \; | grep -v filename | awk '{print $1}') bytes" + echo "executable_data: $(find -name arm_executor_runner -exec size {} \; | grep -v filename | awk '{print $2}') bytes" + echo "executable_bss: $(find -name arm_executor_runner -exec size {} \; | grep -v filename | awk '{print $3}') bytes" + echo "pte_data_size: $(stat -c%s ${pte}) bytes" } # Execute the executor_runner on FVP Simulator diff --git a/examples/arm/setup.sh b/examples/arm/setup.sh index 272ddcfc0c5..3d99143d27b 100755 --- a/examples/arm/setup.sh +++ b/examples/arm/setup.sh @@ -91,6 +91,7 @@ fi ### Optional user args ######## root_dir=${1:-"${script_dir}/ethos-u-scratch"} +mkdir -p ${root_dir} root_dir=$(realpath ${root_dir}) ######## @@ -215,7 +216,7 @@ function setup_vela() { if [[ ! -e ethos-u-vela ]]; then git clone https://review.mlplatform.org/ml/ethos-u/ethos-u-vela repo_dir="${root_dir}/ethos-u-vela" - base_rev=7706c1281166e7611f4300ed26338087152a33c9 + base_rev=fe0eaa55c5ed319f78c01978f3b40eb11a9bcb38 patch_repo fi cd "${root_dir}/ethos-u-vela" @@ -246,7 +247,6 @@ fi cd "${script_dir}" # Setup the root dir -mkdir -p "${root_dir}" cd "${root_dir}" echo "[main] Using root dir ${root_dir}" diff --git a/examples/demo-apps/android/ExecuTorchDemo/README.md b/examples/demo-apps/android/ExecuTorchDemo/README.md index 89d8c34ee39..a60307dd90f 100644 --- a/examples/demo-apps/android/ExecuTorchDemo/README.md +++ b/examples/demo-apps/android/ExecuTorchDemo/README.md @@ -53,7 +53,7 @@ For delegating to Qualcomm Hexagon NPU, please follow the tutorial [here](build- After generating the model, copy the model to `assets` directory. ```bash -python -m examples.qualcomm.scripts.deeplab_v3 -b cmake-out-android -m SM8450 -s +python -m examples.qualcomm.scripts.deeplab_v3 -b build-android -m SM8450 -s cp deeplab_v3/dlv3_qnn.pte examples/demo-apps/android/ExecuTorchDemo/app/src/main/assets/ ``` @@ -78,6 +78,8 @@ cmake . -DCMAKE_INSTALL_PREFIX=cmake-android-out \ -DEXECUTORCH_BUILD_XNNPACK=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -Bcmake-android-out cmake --build cmake-android-out -j16 --target install @@ -119,6 +121,8 @@ cmake . 
-DCMAKE_INSTALL_PREFIX=cmake-android-out \ -DQNN_SDK_ROOT="${QNN_SDK_ROOT}" \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -Bcmake-android-out cmake --build cmake-android-out -j16 --target install diff --git a/examples/demo-apps/android/ExecuTorchDemo/app/src/main/BUCK b/examples/demo-apps/android/ExecuTorchDemo/app/src/main/BUCK new file mode 100644 index 00000000000..2b33cef732a --- /dev/null +++ b/examples/demo-apps/android/ExecuTorchDemo/app/src/main/BUCK @@ -0,0 +1,67 @@ +load("@fbsource//tools/build_defs:manifold.bzl", "manifold_get") +load("@fbsource//tools/build_defs/android:fb_android_binary.bzl", "fb_android_binary") +load("@fbsource//tools/build_defs/android:fb_android_library.bzl", "fb_android_library") +load("@fbsource//tools/build_defs/android:fb_android_resource.bzl", "fb_android_resource") + +manifold_get( + name = "dl3_xnnpack_fp32", + out = "dl3_xnnpack_fp32.pte", + api_key = "executorch-key", + artifact_path = "tree/models/benchmarking/executorch/dl3_xnnpack_fp32.pte", + bucket_name = "executorch", + sha1 = "3e7af1d8f5ec4acb6de156d361715e16e5f53783", + timeout_msec = 120000, +) + +fb_android_resource( + name = "app_res", + assets = "assets", + package = "com.example.executorchdemo", + res = "res", +) + +fb_android_resource( + name = "model_res", + assets = {"dl3_xnnpack_fp32.pte": ":dl3_xnnpack_fp32"}, + package = "com.example.executorchdemo", + res = "res", +) + +fb_android_library( + name = "app_lib", + srcs = [ + "java/com/example/executorchdemo/ClassificationActivity.java", + "java/com/example/executorchdemo/ImageNetClasses.java", + "java/com/example/executorchdemo/MainActivity.java", + "java/com/example/executorchdemo/TensorImageUtils.java", + ], + autoglob = False, + language = "JAVA", + deps = [ + ":app_res", + "//xplat/executorch/extension/android:executorch", + ], +) + +fb_android_binary( + name = "ExecuTorchDemo", + keystore = "//fbandroid/keystores:debug", + manifest = "AndroidManifest.xml", + manifest_entries = { + "min_sdk_version": 19, # Android supports 19 for minimum + "target_sdk_version": 34, + "version_code": "1", + "version_name": "1.0", + }, + package_type = "release", + skip_proguard = True, + deps = [ + ":app_lib", + ":app_res", + ":model_res", + "//third-party/java/androidx/appcompat/appcompat:appcompat", + "//third-party/java/androidx/constraintlayout/constraintlayout:constraintlayout", + "//xplat/executorch/extension/android:executorch", + "//xplat/executorch/extension/android/jni:executorch_jni_full", + ], +) diff --git a/examples/demo-apps/android/ExecuTorchDemo/setup.sh b/examples/demo-apps/android/ExecuTorchDemo/setup.sh index 05dc3e4492e..00d9201b092 100644 --- a/examples/demo-apps/android/ExecuTorchDemo/setup.sh +++ b/examples/demo-apps/android/ExecuTorchDemo/setup.sh @@ -15,6 +15,7 @@ cmake . 
-DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ -DEXECUTORCH_BUILD_XNNPACK=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TESNOR=ON \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ -DCMAKE_BUILD_TYPE=Release \ -B"${CMAKE_OUT}" diff --git a/examples/demo-apps/android/LlamaDemo/README.md b/examples/demo-apps/android/LlamaDemo/README.md index 7bb36657da3..cfb66538269 100644 --- a/examples/demo-apps/android/LlamaDemo/README.md +++ b/examples/demo-apps/android/LlamaDemo/README.md @@ -1,111 +1,139 @@ -# Building ExecuTorch LLaMA Android Demo App - -This app demonstrates the use of the LLaMA chat app demonstrating local inference use case with ExecuTorch. - -## Prerequisites -* Set up your ExecuTorch repo and environment if you haven’t done so by following the [Setting up ExecuTorch](https://pytorch.org/executorch/stable/getting-started-setup) to set up the repo and dev environment. -* Install [Java 17 JDK](https://www.oracle.com/java/technologies/javase/jdk17-archive-downloads.html). -* Install the [Android SDK API Level 34](https://developer.android.com/about/versions/14/setup-sdk) and - [Android NDK 25.0.8775105](https://developer.android.com/studio/projects/install-ndk). - * If you have Android Studio set up, you can install them with - * Android Studio Settings -> Language & Frameworks -> Android SDK -> SDK Platforms -> Check the row with API Level 34. - * Android Studio Settings -> Language & Frameworks -> Android SDK -> SDK Tools -> Check NDK (Side by side) row. - * Alternatively, you can follow [this guide](https://github.com/pytorch/executorch/blob/856e085b9344c8b0bf220a97976140a5b76356aa/examples/demo-apps/android/LlamaDemo/SDK.md) to set up Java/SDK/NDK with CLI. -* Supported Host OS: CentOS, macOS Sonoma on Apple Silicon. - -Note: This demo app and tutorial has only been validated with arm64-v8a [ABI](https://developer.android.com/ndk/guides/abis), with NDK 25.0.8775105. - -## Getting models -Please refer to the [ExecuTorch Llama2 docs](https://github.com/pytorch/executorch/blob/main/examples/models/llama2/README.md) to export the model. - -After you export the model and generate tokenizer.bin, push them device: -```bash -adb shell mkdir -p /data/local/tmp/llama -adb push llama2.pte /data/local/tmp/llama -adb push tokenizer.bin /data/local/tmp/llama -``` +# ExecuTorch Llama Android Demo App -Note: The demo app searches in `/data/local/tmp/llama` for .pte and .bin files as LLAMA model and tokenizer. +We’re excited to share that the newly revamped Android demo app is live and includes many new updates to provide a more intuitive and smoother user experience with a chat use case! The primary goal of this app is to showcase how easily ExecuTorch can be integrated into an Android demo app and how to exercise the many features ExecuTorch and Llama models have to offer. -## Build library -For the demo app to build, we need to build the ExecuTorch AAR library first. +This app serves as a valuable resource to inspire your creativity and provide foundational code that you can customize and adapt for your particular use case. -The AAR library contains the required Java package and the corresponding JNI -library for using ExecuTorch in your Android app. +Please dive in and start exploring our demo app today! We look forward to any feedback and are excited to see your innovative ideas. -### Alternative 1: Use prebuilt AAR library (recommended) -1. Open a terminal window and navigate to the root directory of the `executorch`. -2. 
Run the following command to download the prebuilt library: -```bash -bash examples/demo-apps/android/LlamaDemo/download_prebuilt_lib.sh -``` +## Key Concepts +From this demo app, you will learn many key concepts such as: +* How to prepare Llama models, build the ExecuTorch library, and model inferencing across delegates +* Expose the ExecuTorch library via JNI layer +* Familiarity with current ExecuTorch app-facing capabilities -The prebuilt AAR library contains the Java library and the JNI binding for -NativePeer.java and ExecuTorch native library, including core ExecuTorch -runtime libraries, XNNPACK backend, Portable kernels, Optimized kernels, -and Quantized kernels. It comes with two ABI variants, arm64-v8a and x86_64. +The goal is for you to see the type of support ExecuTorch provides and feel comfortable with leveraging it for your use cases. -If you want to use the prebuilt library for your own app, please refer to -[Using Android prebuilt libraries (AAR)](./android-prebuilt-library.md) for -tutorial. +## Supporting Models +As a whole, the models that this app supports are (varies by delegate): +* Llama 3.1 8B +* Llama 3 8B +* Llama 2 7B +* LLaVA-1.5 vision model (only XNNPACK) -If you need to use other dependencies (like tokenizer), please refer to -Alternative 2: Build from local machine option. -### Alternative 2: Build from local machine -1. Open a terminal window and navigate to the root directory of the `executorch`. -2. Set the following environment variables: -```bash -export ANDROID_NDK= -export ANDROID_ABI=arm64-v8a -``` -Note: `` is the root for the NDK, which is usually under -`~/Library/Android/sdk/ndk/XX.Y.ZZZZZ` for macOS, and contains NOTICE and README.md. -We use `/build/cmake/android.toolchain.cmake` for CMake to cross-compile. +## Building the APK +First it’s important to note that currently ExecuTorch provides support across 3 delegates. Once you identify the delegate of your choice, select the README link to get a complete end-to-end instructions for environment set-up to exporting the models to build ExecuTorch libraries and apps to run on device: -3. (Optional) If you need to use tiktoken as the tokenizer (for LLaMA3), set -`EXECUTORCH_USE_TIKTOKEN=ON` and later CMake will use it as the tokenizer. -If you need to run other models like LLaMA2, skip this skip. +| Delegate | Resource | +| ------------- | ------------- | +| XNNPACK (CPU-based library) | [link](https://github.com/pytorch/executorch/blob/main/examples/demo-apps/android/LlamaDemo/docs/delegates/xnnpack_README.md) | +| QNN (Qualcomm AI Accelerators) | [link](https://github.com/pytorch/executorch/blob/main/examples/demo-apps/android/LlamaDemo/docs/delegates/qualcomm_README.md) | +| MediaTek (MediaTek AI Accelerators) | [link](https://github.com/pytorch/executorch/blob/main/examples/demo-apps/android/LlamaDemo/docs/delegates/mediatek_README.md) | -```bash -export EXECUTORCH_USE_TIKTOKEN=ON # Only for LLaMA3 -``` +## How to Use the App -4. Build the Android Java extension code: -```bash -pushd extension/android -./gradlew build -popd -``` +This section will provide the main steps to use the app, along with a code snippet of the ExecuTorch API. -5. Run the following command set up the required JNI library: -```bash -pushd examples/demo-apps/android/LlamaDemo -./gradlew :app:setup -popd -``` -This is running the shell script [setup.sh](./setup.sh) which configures the required core ExecuTorch, LLAMA2, and Android libraries, builds them, and copy to jniLibs. 
- -## Build APK -### Alternative 1: Android Studio (Recommended) +For loading the app, development, and running on device we recommend Android Studio: 1. Open Android Studio and select "Open an existing Android Studio project" to open examples/demo-apps/android/LlamaDemo. 2. Run the app (^R). This builds and launches the app on the phone. -### Alternative 2: Command line -Without Android Studio UI, we can run gradle directly to build the app. We need to set up the Android SDK path and invoke gradle. -```bash -export ANDROID_HOME= -pushd examples/demo-apps/android/LlamaDemo -./gradlew :app:installDebug -popd +### Opening the App + +Below are the UI features for the app. + +Select the settings widget to get started with picking a model, its parameters and any prompts. +

+ *[screenshots: app main screen and settings widget]*

+### Select Models and Parameters + +Once you've selected the model, tokenizer, and model type, you are ready to click on "Load Model" to have the app load the model and go back to the main Chat activity.

+ *[screenshots: selecting the model, tokenizer, and model type]*

+ + + +Optional Parameters: +* Temperature: Defaulted to 0, you can adjust the temperature for the model as well. The model will reload upon any adjustments. +* System Prompt: Without any formatting, you can enter in a system prompt. For example, "you are a travel assistant" or "give me a response in a few sentences". +* User Prompt: More for the advanced user, if you would like to manually input a prompt then you can do so by modifying the `{{user prompt}}`. You can also modify the special tokens as well. Once changed then go back to the main Chat activity to send. + +#### ExecuTorch App API + +```java +// Upon returning to the Main Chat Activity +mModule = new LlamaModule( + ModelUtils.getModelCategory(mCurrentSettingsFields.getModelType()), + modelPath, + tokenizerPath, + temperature); +int loadResult = mModule.load(); +``` + +* `modelCategory`: Indicate whether it’s a text-only or vision model +* `modePath`: path to the .pte file +* `tokenizerPath`: path to the tokenizer .bin file +* `temperature`: model parameter to adjust the randomness of the model’s output + + +### User Prompt +Once model is successfully loaded then enter any prompt and click the send (i.e. generate) button to send it to the model. +
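Before sending a prompt, it is worth confirming that the `load()` call above actually succeeded. The guard below is only a sketch (the log tag and control flow are illustrative, not the app's exact code); it relies on the convention shown elsewhere in this change that a non-zero status from `LlamaModule.load()` means the model failed to load.

```java
// Illustrative guard: bail out instead of calling generate() on a module that never loaded.
int loadResult = mModule.load();
if (loadResult != 0) {
  android.util.Log.e("LlamaDemo", "Model load failed with status " + loadResult);
  return;
}
```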

+ *[screenshots: entering and sending a prompt]*

+You can ask it follow-up questions as well.

+ *[screenshots: follow-up prompt and response]*

+ +#### ExecuTorch App API + +```java +mModule.generate(prompt, sequence_length, MainActivity.this); ``` +* `prompt`: User formatted prompt +* `sequence_length`: Number of tokens to generate in response to a prompt +* `MainActivity.this`: Indicates that the callback functions (onResult(), onStats()) are present in this class. -On the phone or emulator, you can try running the model: -Android LLaMA App
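To tie the load and generate snippets together, here is a rough sketch of how a send action can drive `generate()` off the UI thread. The widget names (`sendButton`, `promptEditText`) and the single-thread executor are assumptions for illustration; the only pieces taken from this README are the `generate(prompt, sequence_length, callback)` call and the fact that the activity itself serves as the callback.

```java
sendButton.setOnClickListener(view -> {
  String prompt = promptEditText.getText().toString();
  // Run generation on a background thread so the UI stays responsive; generated tokens
  // stream back through onResult() on the callback passed as the last argument.
  java.util.concurrent.Executors.newSingleThreadExecutor()
      .execute(() -> mModule.generate(prompt, 256, MainActivity.this));
});
```

Here `256` is just an example sequence length; pick whatever response budget fits your use case.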
+[*LLaVA-1.5: Only for XNNPACK delegate*] -## Takeaways -Through this tutorial we've learnt how to build the ExecuTorch LLAMA library, and expose it to JNI layer to build the Android app. +For the LLaVA-1.5 implementation, select the exported LLaVA .pte and tokenizer file in the Settings menu and load the model. After this, you can send an image from your gallery or take a live picture along with a text prompt to the model.
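For the image path, a rough sketch of how a picked image becomes model input, based on the `ETImage` helper added elsewhere in this change (the `selectedImage` variable is hypothetical, and the image-capable `generate()` overload is not reproduced here):

```java
// ETImage resizes the picked image and stores its pixels channel-planar
// (all R values, then all G, then all B); getInts() widens those bytes to the
// int[] layout the runner expects.
int[] imageInts = selectedImage.getInts();
// imageInts, together with the image dimensions and the text prompt, is then
// handed to the module's image-capable generate call.
```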

+ *[screenshots: sending an image with a text prompt (LLaVA-1.5)]*

+ + +### Output Generated +To show completion of the follow-up question, here is the complete detailed response from the model. +

+ *[screenshots: full generated response]*

+ +#### ExecuTorch App API + +Ensure you have the following functions in your callback class that you provided in the `mModule.generate()`. For this example, it is `MainActivity.this`. +```java + @Override + public void onResult(String result) { + //...result contains token from response + //.. onResult will continue to be invoked until response is complete + } + + @Override + public void onStats(float tps) { + //...tps (tokens per second) stats is provided by framework + } + +``` ## Reporting Issues If you encountered any bugs or issues following this tutorial please file a bug/issue here on [Github](https://github.com/pytorch/executorch/issues/new). diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/BUCK b/examples/demo-apps/android/LlamaDemo/app/src/main/BUCK new file mode 100644 index 00000000000..80315c4104b --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/BUCK @@ -0,0 +1,65 @@ +load("@fbsource//tools/build_defs/android:fb_android_binary.bzl", "fb_android_binary") +load("@fbsource//tools/build_defs/android:fb_android_library.bzl", "fb_android_library") +load("@fbsource//tools/build_defs/android:fb_android_resource.bzl", "fb_android_resource") + +oncall("executorch") + +fb_android_resource( + name = "app_res", + package = "com.example.executorchllamademo", + res = "res", +) + +fb_android_library( + name = "app_lib", + srcs = [ + "java/com/example/executorchllamademo/AppLog.java", + "java/com/example/executorchllamademo/DemoSharedPreferences.java", + "java/com/example/executorchllamademo/ETImage.java", + "java/com/example/executorchllamademo/ETLogging.java", + "java/com/example/executorchllamademo/LlmBenchmarkRunner.java", + "java/com/example/executorchllamademo/LogsActivity.java", + "java/com/example/executorchllamademo/LogsAdapter.java", + "java/com/example/executorchllamademo/MainActivity.java", + "java/com/example/executorchllamademo/Message.java", + "java/com/example/executorchllamademo/MessageAdapter.java", + "java/com/example/executorchllamademo/MessageType.java", + "java/com/example/executorchllamademo/ModelRunner.java", + "java/com/example/executorchllamademo/ModelRunnerCallback.java", + "java/com/example/executorchllamademo/ModelType.java", + "java/com/example/executorchllamademo/ModelUtils.java", + "java/com/example/executorchllamademo/PromptFormat.java", + "java/com/example/executorchllamademo/SettingsActivity.java", + "java/com/example/executorchllamademo/SettingsFields.java", + ], + autoglob = False, + language = "JAVA", + deps = [ + ":app_res", + "//third-party/java/androidx/constraintlayout/constraintlayout:constraintlayout", + "//third-party/java/com/google/code/gson/gson:gson", + "//xplat/executorch/extension/android:executorch_llama", + ], +) + +fb_android_binary( + name = "ExecuTorchLlamaDemo", + keystore = "//fbandroid/keystores:debug", + manifest = "AndroidManifest.xml", + manifest_entries = { + "min_sdk_version": 21, + "target_sdk_version": 34, + "version_code": "1", + "version_name": "1.0", + }, + package_type = "release", + skip_proguard = True, + deps = [ + ":app_lib", + ":app_res", + "//third-party/java/androidx/appcompat/appcompat:appcompat", + "//third-party/java/com/google/code/gson/gson:gson", + "//xplat/executorch/extension/android:executorch_llama", + "//xplat/executorch/extension/android/jni:executorch_llama_jni", + ], +) diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ETImage.java 
b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ETImage.java index cf3c3e5f0a5..e68c8472626 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ETImage.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ETImage.java @@ -46,6 +46,16 @@ public byte[] getBytes() { return bytes; } + public int[] getInts() { + // We need to convert the byte array to an int array because + // the runner expects an int array as input. + int[] intArray = new int[bytes.length]; + for (int i = 0; i < bytes.length; i++) { + intArray[i] = (bytes[i++] & 0xFF); + } + return intArray; + } + private byte[] getBytesFromImageURI(Uri uri) { try { int RESIZED_IMAGE_WIDTH = 336; @@ -72,9 +82,9 @@ private byte[] getBytesFromImageURI(Uri uri) { int blue = Color.blue(color); // Store the RGB values in the byte array - rgbValues[(y * width + x) * 3] = (byte) red; - rgbValues[(y * width + x) * 3 + 1] = (byte) green; - rgbValues[(y * width + x) * 3 + 2] = (byte) blue; + rgbValues[y * width + x] = (byte) red; + rgbValues[(y * width + x) + height * width] = (byte) green; + rgbValues[(y * width + x) + 2 * height * width] = (byte) blue; } } return rgbValues; diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LlmBenchmarkRunner.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LlmBenchmarkRunner.java index 33b230b1dff..7236fe317b0 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LlmBenchmarkRunner.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LlmBenchmarkRunner.java @@ -9,13 +9,22 @@ package com.example.executorchllamademo; import android.app.Activity; +import android.app.ActivityManager; import android.content.Intent; +import android.os.Build; import android.os.Bundle; import android.util.Log; import android.widget.TextView; import androidx.annotation.NonNull; +import com.google.gson.Gson; +import java.io.File; import java.io.FileWriter; import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; public class LlmBenchmarkRunner extends Activity implements ModelRunnerCallback { ModelRunner mModelRunner; @@ -32,7 +41,12 @@ protected void onCreate(Bundle savedInstanceState) { Intent intent = getIntent(); - String modelPath = intent.getStringExtra("model_path"); + File modelDir = new File(intent.getStringExtra("model_dir")); + File model = + Arrays.stream(modelDir.listFiles()) + .filter(file -> file.getName().endsWith(".pte")) + .findFirst() + .get(); String tokenizerPath = intent.getStringExtra("tokenizer_path"); float temperature = intent.getFloatExtra("temperature", 0.8f); @@ -42,19 +56,21 @@ protected void onCreate(Bundle savedInstanceState) { } mStatsDump = new StatsDump(); - mModelRunner = new ModelRunner(modelPath, tokenizerPath, temperature, this); - mStatsDump.loadStart = System.currentTimeMillis(); + mStatsDump.modelName = model.getName().replace(".pte", ""); + mModelRunner = new ModelRunner(model.getPath(), tokenizerPath, temperature, this); + mStatsDump.loadStart = System.nanoTime(); } @Override public void onModelLoaded(int status) { - mStatsDump.loadEnd = System.currentTimeMillis(); + mStatsDump.loadEnd = System.nanoTime(); + mStatsDump.loadStatus = status; if (status != 0) { 
Log.e("LlmBenchmarkRunner", "Loaded failed: " + status); onGenerationStopped(); return; } - mStatsDump.generateStart = System.currentTimeMillis(); + mStatsDump.generateStart = System.nanoTime(); mModelRunner.generate(mPrompt); } @@ -73,26 +89,122 @@ public void onStats(String stats) { @Override public void onGenerationStopped() { - mStatsDump.generateEnd = System.currentTimeMillis(); + mStatsDump.generateEnd = System.nanoTime(); runOnUiThread( () -> { mTextView.append(mStatsDump.toString()); }); - try (FileWriter writer = new FileWriter(getFilesDir() + "/benchmark_results.txt")) { - writer.write(mStatsDump.toString()); + final BenchmarkMetric.BenchmarkModel benchmarkModel = + BenchmarkMetric.extractBackendAndQuantization(mStatsDump.modelName); + final List results = new ArrayList<>(); + // The list of metrics we have atm includes: + // Load status + results.add(new BenchmarkMetric(benchmarkModel, "load_status", mStatsDump.loadStatus, 0)); + // Model load time + results.add( + new BenchmarkMetric( + benchmarkModel, + "model_load_time(ms)", + (mStatsDump.loadEnd - mStatsDump.loadStart) * 1e-6, + 0.0f)); + // LLM generate time + results.add( + new BenchmarkMetric( + benchmarkModel, + "generate_time(ms)", + (mStatsDump.generateEnd - mStatsDump.generateStart) * 1e-6, + 0.0f)); + // Token per second + results.add( + new BenchmarkMetric(benchmarkModel, "token_per_sec", extractTPS(mStatsDump.tokens), 0.0f)); + + try (FileWriter writer = new FileWriter(getFilesDir() + "/benchmark_results.json")) { + Gson gson = new Gson(); + writer.write(gson.toJson(results)); } catch (IOException e) { e.printStackTrace(); } } + + private double extractTPS(final String tokens) { + final Matcher m = Pattern.compile("\\d+\\.?\\d*").matcher(tokens); + if (m.find()) { + return Double.parseDouble(m.group()); + } else { + return 0.0f; + } + } +} + +class BenchmarkMetric { + public static class BenchmarkModel { + // The model name, i.e. stories110M + String name; + String backend; + String quantization; + + public BenchmarkModel(final String name, final String backend, final String quantization) { + this.name = name; + this.backend = backend; + this.quantization = quantization; + } + } + + BenchmarkModel benchmarkModel; + + // The metric name, i.e. 
TPS + String metric; + + // The actual value and the option target value + double actualValue; + double targetValue; + + public static class DeviceInfo { + // Let's see which information we want to include here + final String device = Build.BRAND; + // The phone model and Android release version + final String arch = Build.MODEL; + final String os = "Android " + Build.VERSION.RELEASE; + final long totalMem = new ActivityManager.MemoryInfo().totalMem; + final long availMem = new ActivityManager.MemoryInfo().availMem; + } + + DeviceInfo deviceInfo = new DeviceInfo(); + + public BenchmarkMetric( + final BenchmarkModel benchmarkModel, + final String metric, + final double actualValue, + final double targetValue) { + this.benchmarkModel = benchmarkModel; + this.metric = metric; + this.actualValue = actualValue; + this.targetValue = targetValue; + } + + // TODO (huydhn): Figure out a way to extract the backend and quantization information from + // the .pte model itself instead of parsing its name + public static BenchmarkMetric.BenchmarkModel extractBackendAndQuantization(final String model) { + final Matcher m = + Pattern.compile("(?\\w+)_(?\\w+)_(?\\w+)").matcher(model); + if (m.matches()) { + return new BenchmarkMetric.BenchmarkModel( + m.group("name"), m.group("backend"), m.group("quantization")); + } else { + return new BenchmarkMetric.BenchmarkModel(model, "", ""); + } + } } class StatsDump { + int loadStatus; long loadStart; long loadEnd; long generateStart; long generateEnd; String tokens; + String modelName; @NonNull @Override diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LogsActivity.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LogsActivity.java index 8700528d44a..7777b275e6e 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LogsActivity.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LogsActivity.java @@ -10,10 +10,12 @@ import android.app.AlertDialog; import android.content.DialogInterface; +import android.os.Build; import android.os.Bundle; import android.widget.ImageButton; import android.widget.ListView; import androidx.appcompat.app.AppCompatActivity; +import androidx.core.content.ContextCompat; import androidx.core.graphics.Insets; import androidx.core.view.ViewCompat; import androidx.core.view.WindowInsetsCompat; @@ -26,6 +28,10 @@ public class LogsActivity extends AppCompatActivity { protected void onCreate(Bundle savedInstanceState) { super.onCreate(savedInstanceState); setContentView(R.layout.activity_logs); + if (Build.VERSION.SDK_INT >= 21) { + getWindow().setStatusBarColor(ContextCompat.getColor(this, R.color.status_bar)); + getWindow().setNavigationBarColor(ContextCompat.getColor(this, R.color.nav_bar)); + } ViewCompat.setOnApplyWindowInsetsListener( requireViewById(R.id.main), (v, insets) -> { diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java index 70936e17d84..524b4fbc8a8 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java @@ -16,9 +16,11 @@ import android.content.Intent; import android.content.pm.PackageManager; import 
android.net.Uri; +import android.os.Build; import android.os.Bundle; import android.os.Handler; import android.os.Looper; +import android.os.Process; import android.provider.MediaStore; import android.system.ErrnoException; import android.system.Os; @@ -44,6 +46,8 @@ import java.lang.reflect.Type; import java.util.ArrayList; import java.util.List; +import java.util.concurrent.Executor; +import java.util.concurrent.Executors; import org.pytorch.executorch.LlamaCallback; import org.pytorch.executorch.LlamaModule; @@ -70,11 +74,25 @@ public class MainActivity extends AppCompatActivity implements Runnable, LlamaCa private SettingsFields mCurrentSettingsFields; private Handler mMemoryUpdateHandler; private Runnable memoryUpdater; + private int promptID = 0; + private long startPos = 0; + private static final int CONVERSATION_HISTORY_MESSAGE_LOOKBACK = 2; + private Executor executor; @Override public void onResult(String result) { - mResultMessage.appendText(result); - run(); + if (result.equals(PromptFormat.getStopToken(mCurrentSettingsFields.getModelType()))) { + return; + } + if (result.equals("\n\n") || result.equals("\n")) { + if (!mResultMessage.getText().isEmpty()) { + mResultMessage.appendText(result); + run(); + } + } else { + mResultMessage.appendText(result); + run(); + } } @Override @@ -102,7 +120,12 @@ private void setLocalModel(String modelPath, String tokenizerPath, float tempera mMessageAdapter.notifyDataSetChanged(); }); long runStartTime = System.currentTimeMillis(); - mModule = new LlamaModule(modelPath, tokenizerPath, temperature); + mModule = + new LlamaModule( + ModelUtils.getModelCategory(mCurrentSettingsFields.getModelType()), + modelPath, + tokenizerPath, + temperature); int loadResult = mModule.load(); long loadDuration = System.currentTimeMillis() - runStartTime; String modelLoadError = ""; @@ -132,6 +155,12 @@ private void setLocalModel(String modelPath, String tokenizerPath, float tempera + (float) loadDuration / 1000 + " sec." 
+ " You can send text or image for inference"; + + if (mCurrentSettingsFields.getModelType() == ModelType.LLAVA_1_5) { + ETLogging.getInstance().log("Llava start prefill prompt"); + startPos = mModule.prefillPrompt(PromptFormat.getLlavaPresetPrompt(), 0, 1, 0); + ETLogging.getInstance().log("Llava completes prefill prompt"); + } } Message modelLoadedMessage = new Message(modelInfo, false, MessageType.SYSTEM, 0); @@ -180,11 +209,21 @@ private void populateExistingMessages(String existingMsgJSON) { mMessageAdapter.notifyDataSetChanged(); } + private int setPromptID() { + + return mMessageAdapter.getMaxPromptID() + 1; + } + @Override protected void onCreate(Bundle savedInstanceState) { super.onCreate(savedInstanceState); setContentView(R.layout.activity_main); + if (Build.VERSION.SDK_INT >= 21) { + getWindow().setStatusBarColor(ContextCompat.getColor(this, R.color.status_bar)); + getWindow().setNavigationBarColor(ContextCompat.getColor(this, R.color.nav_bar)); + } + try { Os.setenv("ADSP_LIBRARY_PATH", getApplicationInfo().nativeLibraryDir, true); } catch (ErrnoException e) { @@ -201,6 +240,7 @@ protected void onCreate(Bundle savedInstanceState) { String existingMsgJSON = mDemoSharedPreferences.getSavedMessages(); if (!existingMsgJSON.isEmpty()) { populateExistingMessages(existingMsgJSON); + promptID = setPromptID(); } mSettingsButton = requireViewById(R.id.settings); mSettingsButton.setOnClickListener( @@ -217,6 +257,7 @@ protected void onCreate(Bundle savedInstanceState) { setupCameraRoll(); startMemoryUpdate(); setupShowLogsButton(); + executor = Executors.newSingleThreadExecutor(); } @Override @@ -522,6 +563,32 @@ private void showMediaPreview(List uris) { imageViews.get(i).setVisibility(View.VISIBLE); imageViews.get(i).setImageURI(mSelectedImageUri.get(i)); } + + // For LLava, we want to call prefill_image as soon as an image is selected + // Llava only support 1 image for now + if (mCurrentSettingsFields.getModelType() == ModelType.LLAVA_1_5) { + List processedImageList = getProcessedImagesForModel(mSelectedImageUri); + if (!processedImageList.isEmpty()) { + mMessageAdapter.add( + new Message("Llava - Starting image Prefill.", false, MessageType.SYSTEM, 0)); + mMessageAdapter.notifyDataSetChanged(); + Runnable runnable = + () -> { + Process.setThreadPriority(Process.THREAD_PRIORITY_MORE_FAVORABLE); + ETLogging.getInstance().log("Starting runnable prefill image"); + ETImage img = processedImageList.get(0); + ETLogging.getInstance().log("Llava start prefill image"); + startPos = + mModule.prefillImages( + img.getInts(), + img.getWidth(), + img.getHeight(), + ModelUtils.VISION_MODEL_IMAGE_CHANNELS, + startPos); + }; + executor.execute(runnable); + } + } } private void addSelectedImagesToChatThread(List selectedImageUri) { @@ -537,6 +604,48 @@ private void addSelectedImagesToChatThread(List selectedImageUri) { mMessageAdapter.notifyDataSetChanged(); } + private String getConversationHistory() { + String conversationHistory = ""; + + ArrayList conversations = + mMessageAdapter.getRecentSavedTextMessages(CONVERSATION_HISTORY_MESSAGE_LOOKBACK); + if (conversations.isEmpty()) { + return conversationHistory; + } + + int prevPromptID = conversations.get(0).getPromptID(); + String conversationFormat = + PromptFormat.getConversationFormat(mCurrentSettingsFields.getModelType()); + String format = conversationFormat; + for (int i = 0; i < conversations.size(); i++) { + Message conversation = conversations.get(i); + int currentPromptID = conversation.getPromptID(); + if (currentPromptID != 
prevPromptID) { + conversationHistory = conversationHistory + format; + format = conversationFormat; + prevPromptID = currentPromptID; + } + if (conversation.getIsSent()) { + format = format.replace(PromptFormat.USER_PLACEHOLDER, conversation.getText()); + } else { + format = format.replace(PromptFormat.ASSISTANT_PLACEHOLDER, conversation.getText()); + } + } + conversationHistory = conversationHistory + format; + + return conversationHistory; + } + + private String getTotalFormattedPrompt(String conversationHistory, String rawPrompt) { + if (conversationHistory.isEmpty()) { + return mCurrentSettingsFields.getFormattedSystemAndUserPrompt(rawPrompt); + } + + return mCurrentSettingsFields.getFormattedSystemPrompt() + + conversationHistory + + mCurrentSettingsFields.getFormattedUserPrompt(rawPrompt); + } + private void onModelRunStarted() { mSendButton.setClickable(false); mSendButton.setImageResource(R.drawable.baseline_stop_24); @@ -552,44 +661,33 @@ private void onModelRunStopped() { mSendButton.setOnClickListener( view -> { addSelectedImagesToChatThread(mSelectedImageUri); - // TODO: When ET supports multimodal, this is where we will add the images as part of the - // prompt. - List processedImageList = getProcessedImagesForModel(mSelectedImageUri); - processedImageList.forEach( - image -> { - ETLogging.getInstance() - .log( - "Image preprocessed:" - + " uri = " - + image.getUri().getLastPathSegment() - + "," - + " width = " - + image.getWidth() - + "," - + " height = " - + image.getHeight() - + "," - + " bytes size = " - + image.getBytes().length); - }); + String finalPrompt; String rawPrompt = mEditTextMessage.getText().toString(); - String prompt = mCurrentSettingsFields.getFormattedSystemAndUserPrompt(rawPrompt); + if (ModelUtils.getModelCategory(mCurrentSettingsFields.getModelType()) + == ModelUtils.VISION_MODEL) { + finalPrompt = mCurrentSettingsFields.getFormattedSystemAndUserPrompt(rawPrompt); + } else { + finalPrompt = getTotalFormattedPrompt(getConversationHistory(), rawPrompt); + } // We store raw prompt into message adapter, because we don't want to show the extra // tokens from system prompt - mMessageAdapter.add(new Message(rawPrompt, true, MessageType.TEXT, 0)); + mMessageAdapter.add(new Message(rawPrompt, true, MessageType.TEXT, promptID)); mMessageAdapter.notifyDataSetChanged(); mEditTextMessage.setText(""); - mResultMessage = new Message("", false, MessageType.TEXT, 0); + mResultMessage = new Message("", false, MessageType.TEXT, promptID); mMessageAdapter.add(mResultMessage); // Scroll to bottom of the list mMessagesView.smoothScrollToPosition(mMessageAdapter.getCount() - 1); // After images are added to prompt and chat thread, we clear the imageURI list // Note: This has to be done after imageURIs are no longer needed by LlamaModule mSelectedImageUri = null; + promptID++; Runnable runnable = new Runnable() { @Override public void run() { + Process.setThreadPriority(Process.THREAD_PRIORITY_MORE_FAVORABLE); + ETLogging.getInstance().log("starting runnable generate()"); runOnUiThread( new Runnable() { @Override @@ -597,9 +695,34 @@ public void run() { onModelRunStarted(); } }); - ETLogging.getInstance().log("Running inference.. 
prompt=" + prompt); long generateStartTime = System.currentTimeMillis(); - mModule.generate(prompt, MainActivity.this); + if (ModelUtils.getModelCategory(mCurrentSettingsFields.getModelType()) + == ModelUtils.VISION_MODEL) { + mModule.generateFromPos( + finalPrompt, + ModelUtils.VISION_MODEL_SEQ_LEN, + startPos, + MainActivity.this, + false); + } else if (mCurrentSettingsFields.getModelType() == ModelType.LLAMA_GUARD_3) { + String llamaGuardPromptForClassification = + PromptFormat.getFormattedLlamaGuardPrompt(rawPrompt); + ETLogging.getInstance() + .log("Running inference.. prompt=" + llamaGuardPromptForClassification); + mModule.generate( + llamaGuardPromptForClassification, + llamaGuardPromptForClassification.length() + 64, + MainActivity.this, + false); + } else { + ETLogging.getInstance().log("Running inference.. prompt=" + finalPrompt); + mModule.generate( + finalPrompt, + (int) (finalPrompt.length() * 0.75) + 64, + MainActivity.this, + false); + } + long generateDuration = System.currentTimeMillis() - generateStartTime; mResultMessage.setTotalGenerationTime(generateDuration); runOnUiThread( @@ -612,7 +735,7 @@ public void run() { ETLogging.getInstance().log("Inference completed"); } }; - new Thread(runnable).start(); + executor.execute(runnable); }); mMessageAdapter.notifyDataSetChanged(); } diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MessageAdapter.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MessageAdapter.java index d9cbd95a1a7..2538c852e48 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MessageAdapter.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MessageAdapter.java @@ -16,6 +16,7 @@ import android.widget.ImageView; import android.widget.TextView; import java.util.ArrayList; +import java.util.Collections; public class MessageAdapter extends ArrayAdapter { @@ -90,4 +91,41 @@ public void clear() { public ArrayList getSavedMessages() { return savedMessages; } + + public ArrayList getRecentSavedTextMessages(int numOfLatestPromptMessages) { + ArrayList recentMessages = new ArrayList(); + int lastIndex = savedMessages.size() - 1; + Message messageToAdd = savedMessages.get(lastIndex); + int oldPromptID = messageToAdd.getPromptID(); + + for (int i = 0; i < savedMessages.size(); i++) { + messageToAdd = savedMessages.get(lastIndex - i); + if (messageToAdd.getMessageType() != MessageType.SYSTEM) { + if (messageToAdd.getPromptID() != oldPromptID) { + numOfLatestPromptMessages--; + oldPromptID = messageToAdd.getPromptID(); + } + if (numOfLatestPromptMessages > 0) { + if (messageToAdd.getMessageType() == MessageType.TEXT) { + recentMessages.add(messageToAdd); + } + } else { + break; + } + } + } + + // To place the order in [input1, output1, input2, output2...] 
+ Collections.reverse(recentMessages); + return recentMessages; + } + + public int getMaxPromptID() { + int maxPromptID = -1; + for (Message msg : savedMessages) { + + maxPromptID = Math.max(msg.getPromptID(), maxPromptID); + } + return maxPromptID; + } } diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelType.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelType.java index 91e84be0590..a241ca3d52d 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelType.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelType.java @@ -12,4 +12,5 @@ public enum ModelType { LLAMA_3, LLAMA_3_1, LLAVA_1_5, + LLAMA_GUARD_3, } diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelUtils.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelUtils.java new file mode 100644 index 00000000000..ab1f1bc92fc --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelUtils.java @@ -0,0 +1,28 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +package com.example.executorchllamademo; + +public class ModelUtils { + static final int TEXT_MODEL = 1; + static final int VISION_MODEL = 2; + static final int VISION_MODEL_IMAGE_CHANNELS = 3; + static final int VISION_MODEL_SEQ_LEN = 768; + static final int TEXT_MODEL_SEQ_LEN = 256; + + public static int getModelCategory(ModelType modelType) { + switch (modelType) { + case LLAVA_1_5: + return VISION_MODEL; + case LLAMA_3: + case LLAMA_3_1: + default: + return TEXT_MODEL; + } + } +} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/PromptFormat.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/PromptFormat.java index 72990f4ea8b..14cf38e669d 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/PromptFormat.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/PromptFormat.java @@ -12,6 +12,8 @@ public class PromptFormat { public static final String SYSTEM_PLACEHOLDER = "{{ system_prompt }}"; public static final String USER_PLACEHOLDER = "{{ user_prompt }}"; + public static final String ASSISTANT_PLACEHOLDER = "{{ assistant_response }}"; + public static final String DEFAULT_SYSTEM_PROMPT = "Answer the questions in a few sentences"; public static String getSystemPromptTemplate(ModelType modelType) { switch (modelType) { @@ -21,6 +23,7 @@ public static String getSystemPromptTemplate(ModelType modelType) { + SYSTEM_PLACEHOLDER + "<|eot_id|>"; case LLAVA_1_5: + return "USER: "; default: return SYSTEM_PLACEHOLDER; } @@ -30,13 +33,85 @@ public static String getUserPromptTemplate(ModelType modelType) { switch (modelType) { case LLAMA_3: case LLAMA_3_1: + case LLAMA_GUARD_3: return "<|start_header_id|>user<|end_header_id|>\n" + USER_PLACEHOLDER - + "<|eot_id|>\n" + + "<|eot_id|>" + "<|start_header_id|>assistant<|end_header_id|>"; + + case LLAVA_1_5: + default: + return USER_PLACEHOLDER; + } + } + + public static String getConversationFormat(ModelType modelType) { + switch 
(modelType) { + case LLAMA_3: + case LLAMA_3_1: + return getUserPromptTemplate(modelType) + "\n" + ASSISTANT_PLACEHOLDER + "<|eot_id|>"; case LLAVA_1_5: + return USER_PLACEHOLDER + " ASSISTANT:"; default: return USER_PLACEHOLDER; } } + + public static String getStopToken(ModelType modelType) { + switch (modelType) { + case LLAMA_3: + case LLAMA_3_1: + case LLAMA_GUARD_3: + return "<|eot_id|>"; + case LLAVA_1_5: + return ""; + default: + return ""; + } + } + + public static String getLlavaPresetPrompt() { + return "A chat between a curious human and an artificial intelligence assistant. The assistant" + + " gives helpful, detailed, and polite answers to the human's questions. USER: "; + } + + public static String getFormattedLlamaGuardPrompt(String userPrompt) { + return getUserPromptTemplate(ModelType.LLAMA_GUARD_3) + .replace( + USER_PLACEHOLDER, getLlamaGuardPresetPrompt().replace(USER_PLACEHOLDER, userPrompt)); + } + + public static String getLlamaGuardPresetPrompt() { + return "Task: Check if there is unsafe content in 'User' messages in conversations" + + " according our safety policy with the below categories.\n" + + "\n" + + "\n" + + "S1: Violent Crimes. \n" + + "S2: Non-Violent Crimes. \n" + + "S3: Sex Crimes. \n" + + "S4: Child Exploitation. \n" + + "S5: Defamation. \n" + + "S6: Specialized Advice. \n" + + "S7: Privacy. \n" + + "S8: Intellectual Property. \n" + + "S9: Indiscriminate Weapons. \n" + + "S10: Hate. \n" + + "S11: Self-Harm. \n" + + "S12: Sexual Content. \n" + + "S13: Elections. \n" + + "S14: Code Interpreter Abuse.\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "User: " + + USER_PLACEHOLDER + + "\n" + + "\n" + + "Provide your safety assessment for ONLY THE LAST User message in the above" + + " conversation:\n" + + " - First line must read 'safe' or 'unsafe'.\n" + + " - If unsafe, a second line must include a comma-separated list of violated" + + " categories."; + } } diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java index 5f1fc96e1ac..773fef19dd7 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java @@ -10,6 +10,7 @@ import android.app.AlertDialog; import android.content.DialogInterface; +import android.os.Build; import android.os.Bundle; import android.text.Editable; import android.text.TextWatcher; @@ -18,6 +19,7 @@ import android.widget.ImageButton; import android.widget.TextView; import androidx.appcompat.app.AppCompatActivity; +import androidx.core.content.ContextCompat; import androidx.core.graphics.Insets; import androidx.core.view.ViewCompat; import androidx.core.view.WindowInsetsCompat; @@ -43,12 +45,16 @@ public class SettingsActivity extends AppCompatActivity { public SettingsFields mSettingsFields; private DemoSharedPreferences mDemoSharedPreferences; - public static double TEMPERATURE_MIN_VALUE = 0.1; + public static double TEMPERATURE_MIN_VALUE = 0.0; @Override protected void onCreate(Bundle savedInstanceState) { super.onCreate(savedInstanceState); setContentView(R.layout.activity_settings); + if (Build.VERSION.SDK_INT >= 21) { + getWindow().setStatusBarColor(ContextCompat.getColor(this, R.color.status_bar)); + getWindow().setNavigationBarColor(ContextCompat.getColor(this, R.color.nav_bar)); + } 
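// The status and navigation bars above are tinted only on API 21+, where setStatusBarColor/setNavigationBarColor are available; older releases keep the system defaults.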
ViewCompat.setOnApplyWindowInsetsListener( requireViewById(R.id.main), (v, insets) -> { @@ -120,6 +126,7 @@ private void setupLoadModelButton() { public void onClick(DialogInterface dialog, int whichButton) { mSettingsFields.saveLoadModelAction(true); mLoadModelButton.setEnabled(false); + onBackPressed(); } }) .setNegativeButton(android.R.string.no, null) @@ -208,8 +215,7 @@ public void afterTextChanged(Editable s) { new DialogInterface.OnClickListener() { public void onClick(DialogInterface dialog, int whichButton) { // Clear the messageAdapter and sharedPreference - mSystemPromptEditText.setText( - PromptFormat.getSystemPromptTemplate(mModelType)); + mSystemPromptEditText.setText(PromptFormat.DEFAULT_SYSTEM_PROMPT); } }) .setNegativeButton(android.R.string.no, null) diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsFields.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsFields.java index 466d3303e28..b71799981b2 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsFields.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsFields.java @@ -38,12 +38,12 @@ public String getFormattedSystemAndUserPrompt(String prompt) { return getFormattedSystemPrompt() + getFormattedUserPrompt(prompt); } - private String getFormattedSystemPrompt() { + public String getFormattedSystemPrompt() { return PromptFormat.getSystemPromptTemplate(modelType) .replace(PromptFormat.SYSTEM_PLACEHOLDER, systemPrompt); } - private String getFormattedUserPrompt(String prompt) { + public String getFormattedUserPrompt(String prompt) { return userPrompt.replace(PromptFormat.USER_PLACEHOLDER, prompt); } diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/banner_shape.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/banner_shape.xml index 70f251ee649..0868ffffa6f 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/banner_shape.xml +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/banner_shape.xml @@ -1,7 +1,5 @@ - - + \ No newline at end of file diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_add_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_add_24.xml index 9f83b8fbe79..2ae27b8409e 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_add_24.xml +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_add_24.xml @@ -1,4 +1,4 @@ - + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_add_photo_alternate_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_add_photo_alternate_24.xml index d710d27110a..7077fedd483 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_add_photo_alternate_24.xml +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_add_photo_alternate_24.xml @@ -1,4 +1,4 @@ - + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_article_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_article_24.xml index 30d5d26b985..a6837b9c69f 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_article_24.xml +++ 
b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_article_24.xml @@ -1,4 +1,5 @@ - + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_close_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_close_24.xml index f8ca0c64b98..fb902d4331b 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_close_24.xml +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_close_24.xml @@ -1,4 +1,5 @@ - + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_delete_forever_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_delete_forever_24.xml index 2c71fc6e568..4680bc6629e 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_delete_forever_24.xml +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_delete_forever_24.xml @@ -1,4 +1,4 @@ - + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_restart_alt_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_restart_alt_24.xml index 9285db079aa..860470ab109 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_restart_alt_24.xml +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_restart_alt_24.xml @@ -1,4 +1,4 @@ - diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_send_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_send_24.xml index 3abc6cb33be..2de1f642089 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_send_24.xml +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_send_24.xml @@ -1,5 +1,6 @@ diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_settings_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_settings_24.xml index 42593b298e9..c51d84b9f4f 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_settings_24.xml +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_settings_24.xml @@ -3,7 +3,8 @@ android:height="24dp" android:viewportWidth="960" android:viewportHeight="960" - android:tint="#000000"> + android:tint="#FFFFFF +"> diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_stop_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_stop_24.xml index 817d57b76a8..832e2585954 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_stop_24.xml +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_stop_24.xml @@ -1,4 +1,5 @@ - + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/chat_background.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/chat_background.xml new file mode 100644 index 00000000000..eb8b9d1f1a9 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/chat_background.xml @@ -0,0 +1,21 @@ + + + + + + + + + + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/expand_circle_down.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/expand_circle_down.xml new file mode 100644 index 00000000000..0a7a71f0700 --- /dev/null +++ 
b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/expand_circle_down.xml @@ -0,0 +1,9 @@ + + + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/input_text_shape.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/input_text_shape.xml index 15c404c60df..35c778a437d 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/input_text_shape.xml +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/input_text_shape.xml @@ -1,10 +1,7 @@ - - + - + \ No newline at end of file diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_add_box_48.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_add_box_48.xml index c8b2c96d585..bb45d63d85b 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_add_box_48.xml +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_add_box_48.xml @@ -1,4 +1,5 @@ - + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_arrow_drop_down_circle_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_arrow_drop_down_circle_24.xml deleted file mode 100644 index a8c859d8b36..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_arrow_drop_down_circle_24.xml +++ /dev/null @@ -1,5 +0,0 @@ - - - - - diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/prompt_shape.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/prompt_shape.xml index 1627ed98c0d..5f81396e382 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/prompt_shape.xml +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/prompt_shape.xml @@ -1,6 +1,6 @@ - + \ No newline at end of file diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/received_message.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/received_message.xml index ea2d1bbfa14..c2288b5bfce 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/received_message.xml +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/received_message.xml @@ -1,6 +1,6 @@ - + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_main.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_main.xml index ec215e63ba1..7b8b8d1760d 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_main.xml +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_main.xml @@ -20,38 +20,32 @@ - - + android:textSize="14sp" /> + android:paddingTop="20dp" + android:src="@drawable/baseline_article_24" /> @@ -83,7 +77,7 @@ android:id="@+id/mediaPreviewConstraintLayout" android:layout_width="match_parent" android:layout_height="wrap_content" - android:background="#edf0ee" + android:background="#16293D" android:visibility="gone"> @@ -169,7 +163,7 @@ + android:text="" + android:textColor="#ffffff" + android:textColorHint="#ffffff" + android:translationY="5dp" /> + + android:textAlignment="viewStart" + android:textColor="#FFFFFF" + android:textSize="22sp" + android:translationX="5dp" + android:translationY="5dp" /> + android:translationX="5dp" /> + android:text="no model selected" + android:textColor="#FFFFFF" /> + android:background="#00FFFFFF" + android:scaleType="center" + android:scaleX="0.7" + android:scaleY="0.7" + android:src="@drawable/expand_circle_down" /> @@ 
-65,10 +74,12 @@ + android:translationX="5dp" /> + android:text="no tokenizer selected" + android:textColor="#FFFFFF" /> + android:background="#00FFFFFF" + android:scaleX="0.7" + android:scaleY="0.7" + android:src="@drawable/expand_circle_down" /> @@ -97,10 +111,12 @@ + android:translationX="5dp" /> + android:text="no model type selected" + android:textColor="#FFFFFF" /> + android:background="#00FFFFFF" + android:scaleX="0.7" + android:scaleY="0.7" + android:src="@drawable/expand_circle_down" /> +